Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

compute AUROC and AUPRC score #15

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 191 additions & 0 deletions SGRNEval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
"""
The SGRN Evaluation module (:mod:`SGRNEval`) contains the
:class:`SGRNEval.SGRNEval` class and three additional classes used in
the definition of the SGRNEval class:
- :class:`SGRNEval.ConfigParser`
- :class:`SGRNEval.InputSettings`
- :class:`SGRNEval.OutputSettings`
"""
import os
import yaml
import argparse
import itertools
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
import multiprocessing
from pathlib import Path
import concurrent.futures
from itertools import permutations
from collections import defaultdict



# local imports
from SGRNEval.computeAUC import PRROC


class InputSettings(object):
    '''
    Plain container for the input-side configuration of an evaluation run.
    The constructor stores its four arguments unchanged; nothing is
    validated or copied.

    :param datadir: input dataset root directory, typically 'inputs/'
    :type datadir: str or path-like
    :param datasets: List of datasets; :meth:`SGRNEval.computeAUC` indexes
        each entry with ``'name'``
    :type datasets: list

    :param algorithms: List of algorithm names
    :type algorithms: list
    :param randSeed: Random-seed setting, kept exactly as supplied
    '''

    def __init__(self, datadir, datasets, algorithms, randSeed) -> None:
        # One tuple assignment: every attribute mirrors the argument of
        # the same name.
        (self.datadir,
         self.datasets,
         self.algorithms,
         self.randSeed) = (datadir, datasets, algorithms, randSeed)


class OutputSettings(object):
    '''
    Plain container for the output-side configuration of an evaluation
    run. Both constructor arguments are stored unchanged.

    :param base_dir: output root directory, typically 'outputs/'
    :type base_dir: str or path-like
    :param output_prefix: A prefix added to the final output files.
    :type output_prefix: pathlib.Path
    '''

    def __init__(self, base_dir, output_prefix: Path) -> None:
        # Stored verbatim; no directory is created or checked here.
        self.output_prefix = output_prefix
        self.base_dir = base_dir




class SGRNEval(object):
    '''
    Evaluation driver built from a pair of settings objects (normally
    produced by parsing the user's configuration file). It keeps both
    settings objects and offers :meth:`computeAUC` to score every
    configured dataset.
    '''

    def __init__(self,
                 input_settings: InputSettings,
                 output_settings: OutputSettings) -> None:
        # Keep references only; nothing is read from disk at construction.
        self.output_settings = output_settings
        self.input_settings = input_settings

    def computeAUC(self):
        '''
        Runs :func:`PRROC` once per configured dataset (always with
        ``selfEdges = False``) and gathers the per-dataset results.

        :returns:
            - AUPRC: A dataframe built from a ``{dataset name: PRROC AUPRC result}`` mapping
            - AUROC: A dataframe built from a ``{dataset name: PRROC AUROC result}`` mapping
        '''
        datasets = self.input_settings.datasets
        prByDataset = {}
        rocByDataset = {}

        progress = tqdm(datasets, total = len(datasets), unit = " Datasets")
        for entry in progress:
            print("Evaluating for %s"%entry)
            scores = PRROC(entry, self.input_settings, selfEdges = False)
            # PRROC returns an (AUPRC, AUROC) pair; file each half under
            # the dataset's name.
            prByDataset[entry['name']], rocByDataset[entry['name']] = scores

        return pd.DataFrame(prByDataset), pd.DataFrame(rocByDataset)




class ConfigParser(object):
    '''
    The class defines static methods for parsing and storing the contents
    of the config file that sets a large number of parameters
    used in the SGRNEval.
    '''
    @staticmethod
    def parse(config_file_handle) -> SGRNEval:
        '''
        A method for parsing the input .yaml file.

        :param config_file_handle: Open handle (or YAML text) of the
            configuration to be parsed; it is passed straight to the
            YAML loader.
        :type config_file_handle: file object or str

        :returns:
            An object of class :class:`SGRNEval.SGRNEval`.

        :raises KeyError: if the ``input_settings`` or ``output_settings``
            section (or one of the keys read from them) is missing.
        '''
        # safe_load instead of yaml.load: calling yaml.load without an
        # explicit Loader is rejected by PyYAML >= 6, and on older
        # releases it can construct arbitrary Python objects from the
        # file. The configuration only needs plain mappings/lists/scalars.
        config_map = yaml.safe_load(config_file_handle)
        return SGRNEval(
            ConfigParser.__parse_input_settings(
                config_map['input_settings']),
            ConfigParser.__parse_output_settings(
                config_map['output_settings']))

    @staticmethod
    def __parse_input_settings(input_settings_map) -> InputSettings:
        '''
        A method for parsing and initializing
        InputSettings object.

        Reads the ``input_dir``, ``dataset_dir``, ``datasets``,
        ``randSeed`` and ``algorithms`` keys; the data directory handed
        to InputSettings is ``input_dir/dataset_dir``.
        '''
        input_dir = input_settings_map['input_dir']
        dataset_dir = input_settings_map['dataset_dir']
        datasets = input_settings_map['datasets']
        randSeed = input_settings_map['randSeed']

        return InputSettings(
            Path(input_dir, dataset_dir),
            datasets,
            ConfigParser.__parse_algorithms(
                input_settings_map['algorithms']),
            randSeed)

    @staticmethod
    def __parse_algorithms(algorithms_list):
        '''
        A method for parsing the list of algorithms
        that are being evaluated, along with
        any parameters being passed.

        Only the first entry of ``algorithms_list`` is read. Every
        encoder/decoder pair from its ``params`` is joined as
        ``"<encoder>-<decoder>"``, encoders varying slowest.

        Note that these parameters may not be
        used in the current evaluation, but can
        be used at a later point.

        :returns: list of ``"<encoder>-<decoder>"`` strings
        '''
        params = algorithms_list[0]['params']
        encoders = params['encoder']
        decoders = params['decoder']

        return [encoder + "-" + decoder
                for encoder in encoders
                for decoder in decoders]

    @staticmethod
    def __parse_output_settings(output_settings_map) -> OutputSettings:
        '''
        A method for parsing and initializing
        Output object.

        Reads the ``output_dir`` and ``output_prefix`` keys and wraps
        both in :class:`pathlib.Path`.
        '''
        output_dir = Path(output_settings_map['output_dir'])
        output_prefix = Path(output_settings_map['output_prefix'])

        return OutputSettings(output_dir,
                              output_prefix)
90 changes: 90 additions & 0 deletions SGRNEval/computeAUC.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import pandas as pd
import numpy as np
#import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from itertools import product, permutations, combinations, combinations_with_replacement
from tqdm import tqdm
from rpy2.robjects.packages import importr
from rpy2.robjects import FloatVector


def PRROC(dataDict, inputSettings, selfEdges = False):
    '''
    Computes areas under the precision-recall and ROC curves
    for a given dataset for each algorithm.

    Results are read from
    ``outputs/<dataset name>/randID-<seed>/<algorithm>/rankedEdges.csv``;
    each file must provide ``TrueScore`` and ``PredScore`` columns.
    Missing files are reported on stdout and skipped.

    :param dataDict: Dataset description; only its ``'name'`` entry is read.
    :type dataDict: dict

    :param inputSettings: Settings object whose ``algorithms`` and ``randSeed`` attributes are iterated (both must support ``len``).

    :param selfEdges: A flag to indicate whether to include self-edges (selfEdges = True) or exclude self-edges (selfEdges = False) from evaluation. NOTE(review): the flag is accepted but not consulted anywhere in this function.
    :type selfEdges: bool

    :returns:
        - AUPRC: A dictionary containing AUPRC values for each algorithm
        - AUROC: A dictionary containing AUROC values for each algorithm
    '''

    AUPRC = {}
    AUROC = {}

    # set-up outDir that stores output directory name
    outDir = "outputs/"+dataDict['name']
    for algo in tqdm(inputSettings.algorithms,
                     total = len(inputSettings.algorithms), unit = "Algorithms"):
        for rSeed in tqdm(inputSettings.randSeed,
                          total = len(inputSettings.randSeed), unit = "Rand Seed"):
            # Build the path once: it is needed for the existence check,
            # the read, and the skip message.
            rankedEdgesFile = outDir + '/randID-' + str(rSeed) + '/' + algo + '/rankedEdges.csv'

            if not Path(rankedEdgesFile).exists():
                print(rankedEdgesFile, ' does not exist. Skipping...')
                continue

            rankedEdgesDF = pd.read_csv(rankedEdgesFile,
                                        sep = ',', header = 0, index_col = None)

            # Only the two areas are returned to the caller; the curve
            # arrays computeScores also yields are discarded.
            # NOTE(review): results are keyed by algorithm only, so when
            # several seeds have output the last one found overwrites the
            # earlier ones -- confirm whether per-seed aggregation is
            # intended.
            _, _, _, _, AUPRC[algo], AUROC[algo] = computeScores(
                rankedEdgesDF['TrueScore'], rankedEdgesDF['PredScore'])

    return AUPRC, AUROC

def computeScores(trueEdgesDF, predEdgeDF):
    '''
    Computes precision-recall and ROC curves
    using scikit-learn, and the area under the PR curve using the R
    package PRROC (through rpy2), for a given set of predictions.

    :param trueEdgesDF: A pandas object (the caller in this file passes the ``TrueScore`` column) holding the true class of every candidate edge: 1 if the edge is present in the reference network, else 0.
    :type trueEdgesDF: pandas Series / single-column DataFrame

    :param predEdgeDF: A pandas object (the caller in this file passes the ``PredScore`` column) holding the predicted weight of every candidate edge, in the same order as ``trueEdgesDF``. Higher the weight, higher the edge confidence.
    :type predEdgeDF: pandas Series / single-column DataFrame


    :returns:
        - prec: A list of precision values (for PR plot)
        - recall: A list of recall values (for PR plot)
        - fpr: A list of false positive rates (for ROC plot)
        - tpr: A list of true positive rates (for ROC plot)
        - AUPRC: Area under the precision-recall curve
        - AUROC: Area under the ROC curve
    '''

    prroc = importr('PRROC')
    # PRROC's pr.curve expects the classifier scores in scores.class0 and
    # the class membership (here the 0/1 labels) in weights.class0. The
    # predictions therefore go in as scores and the true labels as weights.
    prCurve = prroc.pr_curve(scores_class0 = FloatVector(list(predEdgeDF.values)),
                             weights_class0 = FloatVector(list(trueEdgesDF.values)), curve=True)

    fpr, tpr, _ = roc_curve(y_true=trueEdgesDF,
                            y_score=predEdgeDF, pos_label=1)

    # The score argument is passed positionally because its keyword was
    # renamed (probas_pred -> y_score) in newer scikit-learn releases;
    # the positional form works with both.
    prec, recall, _ = precision_recall_curve(trueEdgesDF,
                                             predEdgeDF, pos_label=1)

    # prCurve[1][0]: first value of the second element of PRROC's result
    # (presumably `auc.integral` -- confirm against the PRROC manual).
    return prec, recall, fpr, tpr, prCurve[1][0], auc(fpr, tpr)
75 changes: 75 additions & 0 deletions SGRNEvaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env python
# coding: utf-8

import os
import yaml
import argparse
import itertools
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
import multiprocessing
from pathlib import Path


# local imports
import SGRNEval as ev

def get_parser() -> argparse.ArgumentParser:
    '''
    Build the evaluator's command-line interface.

    Two options are registered: ``-c/--config`` (path of the
    configuration file, defaulting to ``config.yaml``) and the boolean
    switch ``-a/--auc``.

    :return: an argparse ArgumentParser object for parsing command
        line parameters
    '''
    cli = argparse.ArgumentParser(
        description='Run pathway reconstruction pipeline.')

    config_help = ("Configuration file containing list of datasets "
                   "algorithms and output specifications.\n")
    cli.add_argument('-c', '--config',
                     help=config_help,
                     default='config.yaml')

    auc_help = "Compute median of areas under Precision-Recall and ROC curves.\n"
    cli.add_argument('-a', '--auc',
                     help=auc_help,
                     default=False,
                     action="store_true")

    return cli

def parse_arguments():
    '''
    Build the parser with :func:`get_parser` and apply it to the
    process's command-line arguments.

    :return: the argparse namespace holding the parsed options
    '''
    return get_parser().parse_args()

def main():
    '''
    Command-line entry point.

    Parses the command line, loads the YAML configuration and, when
    ``--auc`` is given, writes ``AUPRC.csv`` and ``AUROC.csv`` into
    ``<output base_dir>/<output_prefix>``.
    '''
    opts = parse_arguments()
    config_file = opts.config

    with open(config_file, 'r') as conf:
        evalConfig = ev.ConfigParser.parse(conf)

    print('\nPost-run evaluation started...')
    evalSummarizer = ev.SGRNEval(evalConfig.input_settings, evalConfig.output_settings)

    outDir = os.path.join(str(evalSummarizer.output_settings.base_dir),
                          str(evalSummarizer.output_settings.output_prefix))

    # Compute and plot ROC, PRC and report median AUROC, AUPRC
    if opts.auc:
        print('\n\nComputing areas under ROC and PR curves...')

        # Create the destination up front: to_csv does not create missing
        # directories, and failing here is cheaper than failing after the
        # whole evaluation has run.
        os.makedirs(outDir, exist_ok=True)

        AUPRC, AUROC = evalSummarizer.computeAUC()
        AUPRC.to_csv(os.path.join(outDir, 'AUPRC.csv'))
        AUROC.to_csv(os.path.join(outDir, 'AUROC.csv'))

    print('\n\nEvaluation complete...\n')


# Run the evaluation only when executed as a script, not on import.
if __name__ == '__main__':
    main()