From e1066950cf9a891ae5e8ecac62ff25eac3e523be Mon Sep 17 00:00:00 2001
From: malabikasen
Date: Mon, 3 May 2021 00:00:45 +0000
Subject: [PATCH] compute AUROC and AUPRC score

---
 SGRNEval/__init__.py   | 191 +++++++++++++++++++++++++++++++++++++++++
 SGRNEval/computeAUC.py |  90 +++++++++++++++++++
 SGRNEvaluator.py       |  75 ++++++++++++++++
 3 files changed, 356 insertions(+)
 create mode 100644 SGRNEval/__init__.py
 create mode 100644 SGRNEval/computeAUC.py
 create mode 100644 SGRNEvaluator.py

diff --git a/SGRNEval/__init__.py b/SGRNEval/__init__.py
new file mode 100644
index 0000000..9804c1e
--- /dev/null
+++ b/SGRNEval/__init__.py
@@ -0,0 +1,191 @@
+"""
+SGRN Evaluation (:mod:`SGRNEval`) module contains the
+:class:`SGRNEval.SGRNEval` class and three additional classes used in the
+definition of the SGRNEval class:
+
+- :class:`SGRNEval.ConfigParser`
+- :class:`SGRNEval.InputSettings`
+- :class:`SGRNEval.OutputSettings`
+"""
+import os
+import yaml
+import argparse
+import itertools
+import numpy as np
+import pandas as pd
+import networkx as nx
+from tqdm import tqdm
+import multiprocessing
+from pathlib import Path
+import concurrent.futures
+from itertools import permutations
+from collections import defaultdict
+
+
+# local imports
+from SGRNEval.computeAUC import PRROC
+
+
+class InputSettings(object):
+    '''
+    The class for storing the names of input files.
+    This initializes an InputSettings object based on the
+    following four parameters.
+
+    :param datadir: Input dataset root directory, typically 'inputs/'
+    :type datadir: str
+
+    :param datasets: List of dataset names
+    :type datasets: list
+
+    :param algorithms: List of algorithm names
+    :type algorithms: list
+
+    :param randSeed: List of random seeds, one per evaluated run
+    :type randSeed: list
+    '''
+
+    def __init__(self,
+                 datadir, datasets, algorithms, randSeed) -> None:
+
+        self.datadir = datadir
+        self.datasets = datasets
+        self.algorithms = algorithms
+        self.randSeed = randSeed
+
+
+class OutputSettings(object):
+    '''
+    The class for storing the names of directories that output should
+    be written to. This initializes an OutputSettings object based on the
+    following two parameters.
+
+    :param base_dir: Output root directory, typically 'outputs/'
+    :type base_dir: str
+
+    :param output_prefix: A prefix added to the final output files
+    :type output_prefix: str
+    '''
+
+    def __init__(self, base_dir, output_prefix: Path) -> None:
+        self.base_dir = base_dir
+        self.output_prefix = output_prefix
+
+
+class SGRNEval(object):
+    '''
+    The SGRN Evaluation object is created by parsing a user-provided
+    configuration file. Its methods compute evaluation summaries, such as
+    AUROC and AUPRC, for each algorithm-dataset combination listed in
+    that file.
+    '''
+
+    def __init__(self,
+                 input_settings: InputSettings,
+                 output_settings: OutputSettings) -> None:
+
+        self.input_settings = input_settings
+        self.output_settings = output_settings
+
+
+    def computeAUC(self):
+        '''
+        Computes areas under the precision-recall (PR) and
+        ROC curves for each algorithm-dataset combination.
+
+        :returns:
+            - AUPRC: A dataframe containing AUPRC values for each algorithm-dataset combination
+            - AUROC: A dataframe containing AUROC values for each algorithm-dataset combination
+        '''
+        AUPRCDict = {}
+        AUROCDict = {}
+
+        for dataset in tqdm(self.input_settings.datasets,
+                            total = len(self.input_settings.datasets), unit = " Datasets"):
+            print("Evaluating for %s" % dataset['name'])
+            AUPRC, AUROC = PRROC(dataset, self.input_settings,
+                                 selfEdges = False)
+            AUPRCDict[dataset['name']] = AUPRC
+            AUROCDict[dataset['name']] = AUROC
+        AUPRC = pd.DataFrame(AUPRCDict)
+        AUROC = pd.DataFrame(AUROCDict)
+        return AUPRC, AUROC
+
+
+class ConfigParser(object):
+    '''
+    The class defines static methods for parsing and storing the contents
+    of the config file, which sets the parameters used in SGRNEval.
+    '''
+    @staticmethod
+    def parse(config_file_handle) -> SGRNEval:
+        '''
+        A method for parsing the input .yaml file.
+
+        :param config_file_handle: An open handle to the .yaml file to be parsed
+        :type config_file_handle: file object
+
+        :returns:
+            An object of class :class:`SGRNEval.SGRNEval`.
+        '''
+        config_map = yaml.load(config_file_handle, Loader=yaml.SafeLoader)
+        return SGRNEval(
+            ConfigParser.__parse_input_settings(
+                config_map['input_settings']),
+            ConfigParser.__parse_output_settings(
+                config_map['output_settings']))
+
+    @staticmethod
+    def __parse_input_settings(input_settings_map) -> InputSettings:
+        '''
+        A method for parsing the input settings and initializing an
+        InputSettings object.
+        '''
+        input_dir = input_settings_map['input_dir']
+        dataset_dir = input_settings_map['dataset_dir']
+        datasets = input_settings_map['datasets']
+        randSeed = input_settings_map['randSeed']
+
+        return InputSettings(
+            Path(input_dir, dataset_dir),
+            datasets,
+            ConfigParser.__parse_algorithms(
+                input_settings_map['algorithms']),
+            randSeed)
+
+    @staticmethod
+    def __parse_algorithms(algorithms_list):
+        '''
+        A method for parsing the list of algorithms
+        that are being evaluated, along with
+        any parameters being passed.
+
+        Note that these parameters may not be
+        used in the current evaluation, but can
+        be used at a later point.
+        '''
+
+        # Initialize the list of algorithms
+        algorithms = []
+
+        # Parse contents of algorithms_list: each encoder-decoder pair
+        # is evaluated as one algorithm
+        encoders = algorithms_list[0]['params']['encoder']
+        decoders = algorithms_list[0]['params']['decoder']
+        for encoder in encoders:
+            for decoder in decoders:
+                algorithms.append(encoder + "-" + decoder)
+
+        return algorithms
+
+    @staticmethod
+    def __parse_output_settings(output_settings_map) -> OutputSettings:
+        '''
+        A method for parsing the output settings and initializing an
+        OutputSettings object.
+        '''
+        output_dir = Path(output_settings_map['output_dir'])
+        output_prefix = Path(output_settings_map['output_prefix'])
+
+        return OutputSettings(output_dir,
+                              output_prefix)
\ No newline at end of file
diff --git a/SGRNEval/computeAUC.py b/SGRNEval/computeAUC.py
new file mode 100644
index 0000000..43f0409
--- /dev/null
+++ b/SGRNEval/computeAUC.py
@@ -0,0 +1,90 @@
+import pandas as pd
+import numpy as np
+#import seaborn as sns
+from pathlib import Path
+import matplotlib.pyplot as plt
+from sklearn.metrics import precision_recall_curve, roc_curve, auc
+from itertools import product, permutations, combinations, combinations_with_replacement
+from tqdm import tqdm
+from rpy2.robjects.packages import importr
+from rpy2.robjects import FloatVector
+
+
+def PRROC(dataDict, inputSettings, selfEdges = False):
+    '''
+    Computes areas under the precision-recall and ROC curves
+    for a given dataset for each algorithm.
+
+    :param dataDict: A dictionary describing one dataset from the config; its 'name' key selects the output sub-directory
+    :type dataDict: dict
+
+    :param inputSettings: An InputSettings object listing the algorithms and random seeds to evaluate
+    :type inputSettings: InputSettings
+
+    :param selfEdges: A flag to indicate whether to include self-edges (selfEdges = True) or exclude self-edges (selfEdges = False) from evaluation.
+    :type selfEdges: bool
+
+    :returns:
+        - AUPRC: A dictionary containing AUPRC values for each algorithm
+        - AUROC: A dictionary containing AUROC values for each algorithm
+    '''
+
+    # Initialize data dictionaries
+    precisionDict = {}
+    recallDict = {}
+    FPRDict = {}
+    TPRDict = {}
+    AUPRC = {}
+    AUROC = {}
+
+    # set up outDir, which stores the output directory name
+    outDir = "outputs/" + dataDict['name']
+    for algo in tqdm(inputSettings.algorithms,
+                     total = len(inputSettings.algorithms), unit = "Algorithms"):
+        for rSeed in tqdm(inputSettings.randSeed,
+                          total = len(inputSettings.randSeed), unit = "Rand Seed"):
+            # check if the output rankedEdges file exists
+            rankedEdgesPath = outDir + '/randID-' + str(rSeed) + '/' + algo + '/rankedEdges.csv'
+            if Path(rankedEdgesPath).exists():
+                rankedEdgesDF = pd.read_csv(rankedEdgesPath,
+                                            sep = ',', header = 0, index_col = None)
+                trueEdgesDF = rankedEdgesDF['TrueScore']
+                predDF = rankedEdgesDF['PredScore']
+
+                precisionDict[algo], recallDict[algo], FPRDict[algo], TPRDict[algo], \
+                    AUPRC[algo], AUROC[algo] = computeScores(trueEdgesDF, predDF)
+            else:
+                print(rankedEdgesPath, ' does not exist. Skipping...')
+
+    return AUPRC, AUROC
+
+
+def computeScores(trueEdgesDF, predEdgeDF):
+    '''
+    Computes precision-recall and ROC curves using scikit-learn, and the
+    area under the PR curve using the R PRROC package (via rpy2), for a
+    given set of predictions in the form of a DataFrame.
+
+    :param trueEdgesDF: A pandas dataframe containing the true classes. The indices of this dataframe are all possible edges in a graph formed using the genes in the given dataset. This dataframe only has one column to indicate the class label of an edge. If an edge is present in the reference network, it gets a class label of 1, else 0.
+    :type trueEdgesDF: DataFrame
+
+    :param predEdgeDF: A pandas dataframe containing the edge ranks from the predicted network. The indices of this dataframe are all possible edges. This dataframe only has one column to indicate the edge weights in the predicted network. The higher the weight, the higher the edge confidence.
+    :type predEdgeDF: DataFrame
+
+    :returns:
+        - prec: A list of precision values (for PR plot)
+        - recall: A list of recall values (for PR plot)
+        - fpr: A list of false positive rates (for ROC plot)
+        - tpr: A list of true positive rates (for ROC plot)
+        - AUPRC: Area under the precision-recall curve
+        - AUROC: Area under the ROC curve
+    '''
+
+    # The R PRROC package expects the predicted scores as scores.class0 and
+    # the true (0/1) class labels as weights.class0
+    prroc = importr('PRROC')
+    prCurve = prroc.pr_curve(scores_class0 = FloatVector(list(predEdgeDF.values)),
+                             weights_class0 = FloatVector(list(trueEdgesDF.values)), curve=True)
+
+    fpr, tpr, thresholds = roc_curve(y_true=trueEdgesDF,
+                                     y_score=predEdgeDF, pos_label=1)
+
+    prec, recall, thresholds = precision_recall_curve(y_true=trueEdgesDF,
+                                                      probas_pred=predEdgeDF, pos_label=1)
+
+    # prCurve[1][0] is the integral AUPRC reported by PRROC
+    return prec, recall, fpr, tpr, prCurve[1][0], auc(fpr, tpr)
\ No newline at end of file
diff --git a/SGRNEvaluator.py b/SGRNEvaluator.py
new file mode 100644
index 0000000..bac8b86
--- /dev/null
+++ b/SGRNEvaluator.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import os
+import yaml
+import argparse
+import itertools
+import numpy as np
+import pandas as pd
+import networkx as nx
+from tqdm import tqdm
+import multiprocessing
+from pathlib import Path
+
+
+# local imports
+import SGRNEval as ev
+
+
+def get_parser() -> argparse.ArgumentParser:
+    '''
+    :return: an argparse ArgumentParser object for parsing command
+        line parameters
+    '''
+    parser = argparse.ArgumentParser(
+        description='Run the SGRN evaluation pipeline.')
+
+    parser.add_argument('-c', '--config', default='config.yaml',
+        help="Configuration file containing the list of datasets, "
+             "algorithms, and output specifications.\n")
+
+    parser.add_argument('-a', '--auc', action="store_true", default=False,
+        help="Compute areas under the Precision-Recall and ROC curves.\n")
+
+    return parser
+
+
+def parse_arguments():
+    '''
+    Initialize a parser and use it to parse the command line arguments.
+
+    :return: parsed dictionary of command line arguments
+    '''
+    parser = get_parser()
+    opts = parser.parse_args()
+
+    return opts
+
+
+def main():
+    opts = parse_arguments()
+    config_file = opts.config
+
+    evalConfig = None
+    with open(config_file, 'r') as conf:
+        evalConfig = ev.ConfigParser.parse(conf)
+
+    print('\nPost-run evaluation started...')
+    evalSummarizer = ev.SGRNEval(evalConfig.input_settings, evalConfig.output_settings)
+
+    outDir = os.path.join(str(evalSummarizer.output_settings.base_dir),
+                          str(evalSummarizer.output_settings.output_prefix))
+
+    # Compute areas under the ROC and PR curves and write them to CSV files
+    if opts.auc:
+        print('\n\nComputing areas under ROC and PR curves...')
+
+        # make sure the output directory exists before writing the results
+        os.makedirs(outDir, exist_ok=True)
+        AUPRC, AUROC = evalSummarizer.computeAUC()
+        AUPRC.to_csv(os.path.join(outDir, 'AUPRC.csv'))
+        AUROC.to_csv(os.path.join(outDir, 'AUROC.csv'))
+
+    print('\n\nEvaluation complete...\n')
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
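
A minimal usage sketch of how the pieces added in this patch fit together, assuming a config.yaml shaped the way ConfigParser expects. The directory names, dataset name, and encoder/decoder labels in the comments are hypothetical placeholders.

# ConfigParser.parse expects a YAML file roughly like:
#
#   input_settings:
#       input_dir: "inputs"
#       dataset_dir: "example"
#       datasets:
#           - name: "dataset1"          # 'name' selects outputs/<name>/
#       randSeed: [0, 1, 2]
#       algorithms:
#           - params:
#                 encoder: ["GCN"]
#                 decoder: ["DOT"]
#   output_settings:
#       output_dir: "outputs"
#       output_prefix: "example"
#
# PRROC() then looks for per-seed ranked-edge files at
#   outputs/<dataset name>/randID-<seed>/<encoder>-<decoder>/rankedEdges.csv
# with 'TrueScore' (0/1 reference labels) and 'PredScore' (edge confidences) columns.

import SGRNEval as ev

with open('config.yaml', 'r') as conf:
    evalConfig = ev.ConfigParser.parse(conf)

evaluator = ev.SGRNEval(evalConfig.input_settings, evalConfig.output_settings)
AUPRC, AUROC = evaluator.computeAUC()   # DataFrames: algorithms (rows) x datasets (columns)

Equivalently, via the wrapper script added above:

    python SGRNEvaluator.py --config config.yaml --auc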