diff --git a/.gitignore b/.gitignore index 7627003d..fe61b21f 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ old-code/* config_files/* data/* +callflow/unused/* app/node_modules* app/dist diff --git a/app/src/components/callflow.js b/app/src/components/callflow.js index 753a5f80..07cc0126 100644 --- a/app/src/components/callflow.js +++ b/app/src/components/callflow.js @@ -115,7 +115,7 @@ export default { selectedOutlierBand: 4, defaultCallSite: "", modes: ["Ensemble", "Single"], - selectedMode: "Ensemble", + selectedMode: "Single", // Presentation mode variables exhibitModes: ["Presentation", "Default"], selectedExhibitMode: "Default", @@ -164,9 +164,11 @@ export default { mounted() { var socket = io.connect(this.server, { reconnect: false }); + console.log(this.selectedMode) this.$socket.emit("init", { - caseStudy: this.selectedCaseStudy + mode: this.selectedMode }); + EventHandler.$on("lasso_selection", () => { this.$store.resetTargetDataset = true; @@ -201,6 +203,7 @@ export default { this.setTargetDataset(); this.setComponentMap(); + console.log(this.selectedFormat.length, this.selectedMode) if (this.selectedFormat == "SuperGraph") { if (this.selectedMode == "Single") { this.$socket.emit("single_callsite_data", { @@ -265,13 +268,6 @@ export default { }, methods: { - // Feature: Sortby the datasets and show the time. - formatRuntimeWithoutUnits(val) { - let format = d3.format(".2"); - let ret = format(val); - return ret; - }, - // Feature: Sortby the datasets and show the time. sortDatasetsByAttr(datasets, attr) { let ret = datasets.sort((a, b) => { @@ -330,10 +326,10 @@ export default { this.selectedMode = "Single"; } - this.$store.maxExcTime = data["max_excTime"]; - this.$store.minExcTime = data["min_excTime"]; - this.$store.maxIncTime = data["max_incTime"]; - this.$store.minIncTime = data["min_incTime"]; + this.$store.maxExcTime = data["maxExcTime"]; + this.$store.minExcTime = data["minExcTime"]; + this.$store.maxIncTime = data["maxIncTime"]; + this.$store.minIncTime = data["minIncTime"]; this.$store.numOfRanks = data["numOfRanks"]; this.$store.moduleCallsiteMap = data["module_callsite_map"]; @@ -342,8 +338,6 @@ export default { this.$store.selectedMPIBinCount = this.selectedMPIBinCount; this.$store.selectedRunBinCount = this.selectedRunBinCount; - this.selectedIncTime = ((this.selectedFilterPerc * this.$store.maxIncTime[this.selectedTargetDataset] * 0.000001) / 100).toFixed(3); - this.setViewDimensions(); this.$store.auxiliarySortBy = this.auxiliarySortBy; @@ -383,6 +377,7 @@ export default { this.$store.resetTargetDataset = true; } this.$store.selectedMetric = this.selectedMetric; + console.log(this.$store.selectedDatasets) this.datasets = this.sortDatasetsByAttr(this.$store.selectedDatasets, "Inclusive"); let max_dataset = ""; @@ -411,8 +406,9 @@ export default { else { this.$store.selectedTargetDataset = this.selectedTargetDataset; } + this.selectedIncTime = ((this.selectedFilterPerc * this.$store.maxIncTime[this.selectedTargetDataset] * 0.000001) / 100).toFixed(3); - console.log("Minimum among all runtimes: ", this.selectedTargetDataset); + console.log("Maximum among all runtimes: ", this.selectedTargetDataset); }, setComponentMap() { @@ -497,6 +493,7 @@ export default { // Feature: the Supernode hierarchy is automatically selected from the mean metric runtime. sortModulesByMetric(attr) { + console.log(this.$store.modules) let module_list = Object.keys(this.$store.modules["ensemble"]); // Create a map for each dataset mapping the respective mean times. 
@@ -595,7 +592,9 @@ export default { this.setupColors(); this.setOtherData(); this.setTargetDataset(); - this.setSelectedModule(); + if(this.selectedFormat == 'SuperGraph' && this.selectedMode == 'Ensemble'){ + this.setSelectedModule(); + } console.log("Mode : ", this.selectedMode); console.log("Number of runs :", this.$store.numOfRuns); @@ -623,6 +622,7 @@ export default { this.loadComponents(this.currentEnsembleCallGraphComponents); } else if (this.selectedFormat == "CCT") { + console.log(this.currentEnsembleCCTComponents) this.initComponents(this.currentEnsembleCCTComponents); } } diff --git a/app/src/components/ensembleHistogram/ensembleHistogram.js b/app/src/components/ensembleHistogram/ensembleHistogram.js index 9b3bcf5d..203071bb 100644 --- a/app/src/components/ensembleHistogram/ensembleHistogram.js +++ b/app/src/components/ensembleHistogram/ensembleHistogram.js @@ -105,7 +105,6 @@ export default { EventHandler.$emit("ensemble_histogram", { module: this.$store.selectedModule, - name: "main", dataset: this.$store.runNames, }); }, diff --git a/app/src/components/ensembleScatterplot/ensembleScatterplot.js b/app/src/components/ensembleScatterplot/ensembleScatterplot.js index 91367e79..6bce6200 100644 --- a/app/src/components/ensembleScatterplot/ensembleScatterplot.js +++ b/app/src/components/ensembleScatterplot/ensembleScatterplot.js @@ -79,7 +79,6 @@ export default { this.$refs.ToolTip.init(this.svgID); EventHandler.$emit("ensemble_scatterplot", { module: this.$store.selectedModule, - name: "main", dataset: this.$store.selectedDatasets, }); }, diff --git a/callflow/__init__.py b/callflow/__init__.py index f45be28f..579257c8 100644 --- a/callflow/__init__.py +++ b/callflow/__init__.py @@ -5,11 +5,10 @@ from .datastructures.graphframe import GraphFrame +from .datastructures.supergraph import SuperGraph +from .datastructures.ensemblegraph import EnsembleGraph +from .datastructures.cct import CCT from .datastructures.supergraph_ensemble import EnsembleSuperGraph from .datastructures.supergraph_single import SingleSuperGraph -from .datastructures.cct_ensemble import EnsembleCCT -from .datastructures.cct_single import SingleCCT -from .callflow_base import BaseCallFlow -from .callflow_single import SingleCallFlow -from .callflow_ensemble import EnsembleCallFlow +from .callflow import CallFlow diff --git a/callflow/callflow.py b/callflow/callflow.py new file mode 100644 index 00000000..fb1d61ea --- /dev/null +++ b/callflow/callflow.py @@ -0,0 +1,464 @@ +# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# CallFlow Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +# ------------------------------------------------------------------------------ +# Library imports +import os +import json + +# ------------------------------------------------------------------------------ +# CallFlow imports +import callflow +from callflow import SuperGraph, EnsembleGraph, CCT, EnsembleSuperGraph +from callflow.modules import ( + EnsembleAuxiliary, + ModuleHierarchy, + ParameterProjection, + FunctionList, +) + +LOGGER = callflow.get_logger(__name__) + +# ------------------------------------------------------------------------------ +# CallFlow class +class CallFlow: + def __init__(self, config={}, process=False, ensemble=False): + """ + Entry interface to access CallFlow's functionalities. " + """ + + # Assert if config is provided. + assert config != None + + # Convert config json to props. Never touch self.config ever. 
+ self.props = json.loads(json.dumps(config, default=lambda o: o.__dict__)) + # Assert ensemble if it really contains more than 1 dataset. + assert ensemble == (len(self.props["dataset_names"]) > 1) + + # Based on option, either process into .callflow or read from .callflow. + if process: + self._create_dot_callflow_folder() + if ensemble: + self._process_ensemble() + else: + self._process_single() + else: # Rendering of call graphs. + if ensemble: + self.supergraphs = self._read_ensemble() + # assertion here is 1 less than self.supergraph.keys, because + # self.supergraphs contains the ensemble supergraph as well. + assert ( + len(self.props["dataset_names"]) == len(self.supergraphs.keys()) - 1 + ) + else: + self.supergraphs = self._read_single() + assert len(self.props["dataset_names"]) == 1 + + # Adds basic information to props. + # Props is later returned to the client app on "init" request. + self.add_basic_info_to_props() + + # -------------------------------------------------------------------------- + # Processing methods. + def _create_dot_callflow_folder(self): + """ + Create a .callflow directory and empty files. + """ + LOGGER.debug(f"Saved .callflow directory is: {self.props['save_path']}") + + if not os.path.exists(self.props["save_path"]): + os.makedirs(self.props["save_path"]) + os.makedirs(os.path.join(self.props["save_path"], "ensemble")) + + dataset_folders = [] + for dataset in self.props["datasets"]: + dataset_folders.append(dataset["name"]) + dataset_folders.append("ensemble") + + for dataset in dataset_folders: + dataset_dir = os.path.join(self.props["save_path"], dataset) + LOGGER.debug(dataset_dir) + if not os.path.exists(dataset_dir): + # if self.debug: + LOGGER.debug(f"Creating .callflow directory for dataset : {dataset}") + os.makedirs(dataset_dir) + + files = ["df.csv", "nxg.json", "hatchet_tree.txt", "auxiliary_data.json"] + for f in files: + fname = os.path.join(dataset_dir, f) + if not os.path.exists(fname): + open(fname, "w").close() + + def _remove_dot_callflow_folder(self): + """ + TODO: We might want to delete the .callflow folder when we re-process/re-write. + """ + pass + + def _process_single(self): + """ + Single dataset processing. + """ + dataset_name = self.props["dataset_names"][0] + supergraph = SuperGraph(props=self.props, tag=dataset_name, mode="process") + LOGGER.info("#########################################") + LOGGER.info(f"Run: {dataset_name}") + LOGGER.info("#########################################") + + # Process each graphframe. + supergraph.process_gf() + + # Filter by inclusive or exclusive time. + supergraph.filter_gf(mode="single") + + # Group by module. + supergraph.group_gf(group_by="module") + + # Store the graphframe. + supergraph.write_gf("entire") + + supergraph.single_auxiliary( + dataset=dataset_name, binCount=20, process=True, + ) + + def _process_ensemble(self): + """ + Ensemble processing of datasets. + """ + # Before we process the ensemble, we perform single processing on all datasets. + single_supergraphs = {} + for idx, dataset_name in enumerate(self.props["dataset_names"]): + # Create an instance of dataset. + single_supergraphs[dataset_name] = SuperGraph( + props=self.props, tag=dataset_name, mode="process" + ) + LOGGER.info("#########################################") + LOGGER.info(f"Run: {dataset_name}") + LOGGER.info("#########################################") + + # Process each graphframe. + single_supergraphs[dataset_name].process_gf() + + # Write the entire graphframe into .callflow. 
+ single_supergraphs[dataset_name].write_gf("entire") + + # Single auxiliary processing. + single_supergraphs[dataset_name].single_auxiliary( + dataset=dataset_name, binCount=20, process=True, + ) + + # Create a supergraph class for ensemble case. + ensemble_supergraph = EnsembleGraph( + self.props, "ensemble", mode="process", supergraphs=single_supergraphs + ) + + # Write the graphframe to file. + ensemble_supergraph.write_gf("entire") + + # Filter the ensemble graphframe. + ensemble_supergraph.filter_gf(mode="ensemble") + + # Write the filtered graphframe. + ensemble_supergraph.write_gf("filter") + + # Group by module. + ensemble_supergraph.group_gf(group_by="module") + + # Write the grouped graphframe. + ensemble_supergraph.write_gf("group") + + # Ensemble auxiliary processing. + ensemble_supergraph.ensemble_auxiliary( + # MPIBinCount=self.currentMPIBinCount, + # RunBinCount=self.currentRunBinCount, + datasets=self.props["dataset_names"], + MPIBinCount=20, + RunBinCount=20, + process=True, + write=True, + ) + + def _read_single(self): + """ + Read the single .callflow files required for client. + """ + supergraphs = {} + # Only consider the first dataset from the listing. + dataset_name = self.props["dataset_names"][0] + supergraphs[dataset_name] = SuperGraph( + props=self.props, tag=dataset_name, mode="render" + ) + + return supergraphs + + def _read_ensemble(self): + """ + Read the ensemble .callflow files required for client. + """ + supergraphs = {} + + for idx, dataset_name in enumerate(self.props["dataset_names"]): + supergraphs[dataset_name] = SuperGraph( + self.props, dataset_name, mode="render" + ) + supergraphs[dataset_name].read_gf( + read_parameter=self.props["read_parameter"] + ) + + supergraphs["ensemble"] = EnsembleGraph( + props=self.props, tag="ensemble", mode="render" + ) + supergraphs["ensemble"].read_gf(read_parameter=self.props["read_parameter"]) + supergraphs["ensemble"].read_auxiliary_data() + return supergraphs + + # -------------------------------------------------------------------------- + # Reading and rendering methods. + # All the functions below are Public methods that are accessed by the server. + + def add_basic_info_to_props(self): + """ + Adds basic information (like max, min inclusive and exclusive runtime) to self.props. 
+ """ + self.props["maxIncTime"] = {} + self.props["maxExcTime"] = {} + self.props["minIncTime"] = {} + self.props["minExcTime"] = {} + self.props["numOfRanks"] = {} + maxIncTime = 0 + maxExcTime = 0 + minIncTime = 0 + minExcTime = 0 + maxNumOfRanks = 0 + for idx, tag in enumerate(self.supergraphs): + self.props["maxIncTime"][tag] = ( + self.supergraphs[tag].gf.df["time (inc)"].max() + ) + self.props["maxExcTime"][tag] = self.supergraphs[tag].gf.df["time"].max() + self.props["minIncTime"][tag] = ( + self.supergraphs[tag].gf.df["time (inc)"].min() + ) + self.props["minExcTime"][tag] = self.supergraphs[tag].gf.df["time"].min() + # self.props["numOfRanks"][dataset] = len( + # self.datasets[dataset].gf.df["rank"].unique() + # ) + maxExcTime = max(self.props["maxExcTime"][tag], maxExcTime) + maxIncTime = max(self.props["maxIncTime"][tag], maxIncTime) + minExcTime = min(self.props["minExcTime"][tag], minExcTime) + minIncTime = min(self.props["minIncTime"][tag], minIncTime) + # maxNumOfRanks = max(self.props["numOfRanks"][dataset], maxNumOfRanks) + + self.props["maxIncTime"]["ensemble"] = maxIncTime + self.props["maxExcTime"]["ensemble"] = maxExcTime + self.props["minIncTime"]["ensemble"] = minIncTime + self.props["minExcTime"]["ensemble"] = minExcTime + # self.props["numOfRanks"]["ensemble"] = maxNumOfRanks + + def request_single(self, operation): + """ + TODO: Write individual functiosn to do this. + Handles all the socket requests connected to Single CallFlow. + """ + LOGGER.info(f"[Single Mode] {operation}") + operation_tag = operation["name"] + + if operation_tag == "init": + return self.props + + if "groupBy" in operation: + LOGGER.info("Grouping by: {0}".format(operation["groupBy"])) + else: + operation["groupBy"] = "name" + + dataset = operation["dataset"] + + LOGGER.info("The selected Dataset is {0}".format(dataset)) + + # Compare against the different operations + # TODO: Probably remove. + if operation_tag == "reset": + datasets = [dataset] + self.reProcess = True + self.states = self.pipeline( + datasets, operation["filterBy"], operation["filterPerc"] + ) + self.reProcess = False + self.states = self.pipeline(datasets) + return {} + + elif operation_tag == "auxiliary": + return self.supergraphs[dataset].auxiliary_data + + elif operation_tag == "supergraph": + return self.supergraphs[dataset].gf.nxg + + elif operation_tag == "mini-histogram": + minihistogram = MiniHistogram(state) + return minihistogram.result + + elif operation_tag == "cct": + graph = CCT( + supergraphs=self.supergraphs, + tag=operation["dataset"], + callsite_count=operation["functionsInCCT"], + ) + return graph.g + + elif operation_tag == "function": + functionlist = FunctionList(state, operation["module"], operation["nid"]) + return functionlist.result + + def request_ensemble(self, operation): + """ + TODO: Write individual functiosn to do this. + Handles all the socket requests connected to Single CallFlow. 
+ """ + operation_tag = operation["name"] + datasets = self.props["dataset_names"] + + if operation_tag == "init": + return self.props + + elif operation_tag == "ensemble_cct": + result = CCT( + supergraphs=self.supergraphs, + tag="ensemble", + props=self.props, + callsite_count=operation["functionsInCCT"], + ) + return result.gf.nxg + + elif operation_tag == "supergraph": + if "reveal_callsites" in operation: + reveal_callsites = operation["reveal_callsites"] + else: + reveal_callsites = [] + + if "split_entry_module" in operation: + split_entry_module = operation["split_entry_module"] + else: + split_entry_module = "" + + if "split_callee_module" in operation: + split_callee_module = operation["split_callee_module"] + else: + split_callee_module = "" + + ensemble_super_graph = EnsembleSuperGraph( + supergraphs=self.supergraphs, + tag="ensemble", + path="group_path", + group_by_attr="module", + props=self.props, + construct_graph=True, + add_data=True, + reveal_callsites=reveal_callsites, + split_entry_module=split_entry_module, + split_callee_module=split_callee_module, + ) + return ensemble_super_graph.agg_nxg + + # Not used. + elif operation_tag == "scatterplot": + assert False + if operation["plot"] == "bland-altman": + state1 = self.states[operation["dataset"]] + state2 = self.states[operation["dataset2"]] + col = operation["col"] + catcol = operation["catcol"] + dataset1 = operation["dataset"] + dataset2 = operation["dataset2"] + ret = BlandAltman( + state1, state2, col, catcol, dataset1, dataset2 + ).results + return ret + + # Not used. + elif operation_tag == "similarity": + assert False + if operation["module"] == "all": + dirname = self.config.callflow_dir + name = self.config.runName + similarity_filepath = dirname + "/" + "similarity.json" + with open(similarity_filepath, "r") as similarity_file: + self.similarities = json.load(similarity_file) + else: + self.similarities = {} + for idx, dataset in enumerate(datasets): + self.similarities[dataset] = [] + for idx_2, dataset2 in enumerate(datasets): + union_similarity = Similarity( + self.states[dataset2].g, self.states[dataset].g + ) + self.similarities[dataset].append(union_similarity.result) + return self.similarities + + elif operation_tag == "hierarchy": + mH = ModuleHierarchy(self.supergraphs["ensemble"], operation["module"]) + return mH.result + + elif operation_tag == "projection": + self.similarities = {} + # dirname = self.config.callflow_dir + # name = self.config.runName + # similarity_filepath = dirname + '/' + 'similarity.json' + # with open(similarity_filepath, 'r') as similarity_file: + # self.similarities = json.load(similarity_file) + result = ParameterProjection( + self.supergraphs["ensemble"], + self.similarities, + operation["targetDataset"], + n_cluster=operation["numOfClusters"], + ).result + return result.to_json(orient="columns") + + # Not used. + elif operation_tag == "run-information": + assert False + ret = [] + for idx, state in enumerate(self.states): + self.states[state].projection_data["dataset"] = state + ret.append(self.states[state].projection_data) + return ret + + # TODO: need to handle re-processing case. + # The commented code below was used to enable re-processing. 
+ elif operation_tag == "auxiliary": + # print(f"Reprocessing: {operation['re-process']}") + # aux = EnsembleAuxiliary( + # self.states, + # MPIBinCount=operation["MPIBinCount"], + # RunBinCount=operation["RunBinCount"], + # datasets=operation["datasets"], + # config=self.config, + # process=True, + # write=False, + # ) + # if operation["re-process"] == 1: + # result = aux.run() + # else: + + # Need these two variables to belong to some class. Not sure where. + # Will take care when pre-processing is done. + # self.currentMPIBinCount = operation["MPIBinCount"] + # self.currentRunBinCount = operation["RunBinCount"] + + return self.supergraphs["ensemble"].auxiliary_data + + elif operation_tag == "compare": + compareDataset = operation["compareDataset"] + targetDataset = operation["targetDataset"] + if operation["selectedMetric"] == "Inclusive": + selectedMetric = "time (inc)" + elif operation["selectedMetric"] == "Exclusive": + selectedMetric = "time" + + compare = DiffView( + self.supergraphs["ensemble"], + compareDataset, + targetDataset, + selectedMetric, + ) + return compare.result diff --git a/callflow/callflow_base.py b/callflow/callflow_base.py deleted file mode 100644 index 0b0b7c03..00000000 --- a/callflow/callflow_base.py +++ /dev/null @@ -1,131 +0,0 @@ -import os - -import callflow - -LOGGER = callflow.get_logger(__name__) -from callflow.pipeline import Pipeline - - -class AppState: - def __init__(self, config): - self.config = config - - self.maxIncTime = {} - self.maxExcTime = {} - self.minIncTime = {} - self.minExcTime = {} - self.numOfRanks = {} - - def add_target_df(self): - self.target_df = {} - for dataset in self.config.dataset_names: - self.target_df[dataset] = self.states["ensemble_entire"].new_gf.df.loc[ - self.states["ensemble_entire"].new_gf.df["dataset"] == dataset - ] - - def add_basic_info(self): - maxIncTime = 0 - maxExcTime = 0 - minIncTime = 0 - minExcTime = 0 - maxNumOfRanks = 0 - for idx, dataset in enumerate(self.config.dataset_names): - self.maxIncTime[dataset] = self.target_df[dataset]["time (inc)"].max() - self.maxExcTime[dataset] = self.target_df[dataset]["time"].max() - self.minIncTime[dataset] = self.target_df[dataset]["time (inc)"].min() - self.minExcTime[dataset] = self.target_df[dataset]["time"].min() - self.numOfRanks[dataset] = len(self.target_df[dataset]["rank"].unique()) - max_exclusive_time = max(self.maxExcTime[dataset], maxExcTime) - max_inclusive_time = max(self.maxIncTime[dataset], maxIncTime) - min_exclusive_time = min(self.minExcTime[dataset], minExcTime) - min_inclusive_time = min(self.minIncTime[dataset], minIncTime) - max_numOfRanks = max(self.numOfRanks[dataset], max_numOfRanks) - self.maxIncTime["ensemble"] = maxIncTime - self.maxExcTime["ensemble"] = maxExcTime - self.minIncTime["ensemble"] = minIncTime - self.minExcTime["ensemble"] = minExcTime - self.numOfRanks["ensemble"] = maxNumOfRanks - - -class Config: - def __init__(self): - pass - - -class BaseCallFlow: - def __init__(self, config={}, process=False): - - # Assert if config is provided. - assert config != None - self.config = config - - if process: - self.pipeline = Pipeline(self.config) - self._create_dot_callflow_folder() - self.process_states() - - else: - self.appState = AppState(self.config) - self.read_states() - - # -------------------------------------------------------------------------- - # public API. 
child classes should implement these functions - def process_states(self): - self._process_states() - - def read_states(self): - self._read_states() - - def request(self, operation): - self._request(operation) - - # -------------------------------------------------------------------------- - def displayStats(self, name): - log.warn("==========================") - log.info("Number of datasets : {0}".format(len(self.config[name].paths.keys()))) - log.info("Stats: Dataset ({0}) ".format(name)) - log.warn("==========================") - max_inclusive_time = utils.getMaxIncTime(gf) - max_exclusive_time = utils.getMaxExcTime(gf) - avg_inclusive_time = utils.getAvgIncTime(gf) - avg_exclusive_time = utils.getAvgExcTime(gf) - num_of_nodes = utils.getNumOfNodes(gf) - log.info("[] Rows in dataframe: {0}".format(self.states[name].df.shape[0])) - log.info("Max Inclusive time = {0} ".format(max_inclusive_time)) - log.info("Max Exclusive time = {0} ".format(max_exclusive_time)) - log.info("Avg Inclusive time = {0} ".format(avg_inclusive_time)) - log.info("Avg Exclusive time = {0} ".format(avg_exclusive_time)) - log.info("Number of nodes in CCT = {0}".format(num_of_nodes)) - - # -------------------------------------------------------------------------- - def _create_dot_callflow_folder(self): - """ - Create a .callflow directory and empty files. - """ - LOGGER.debug(f"Saved .callflow directory is: {self.config.save_path}") - - if not os.path.exists(self.config.save_path): - os.makedirs(self.config.save_path) - - for dataset in self.config.datasets: - dataset_dir = os.path.join(self.config.save_path, dataset["name"]) - LOGGER.debug(dataset_dir) - if not os.path.exists(dataset_dir): - #if self.debug: - LOGGER.debug( - f"Creating .callflow directory for dataset : {dataset['name']}" - ) - os.makedirs(dataset_dir) - - files = [ - "entire_df.csv", - "filter_df.csv", - "entire_graph.json", - "filter_graph.json", - ] - for f in files: - fname = os.path.join(dataset_dir, f) - if not os.path.exists(fname): - open(fname, "w").close() - - # -------------------------------------------------------------------------- diff --git a/callflow/callflow_ensemble.py b/callflow/callflow_ensemble.py deleted file mode 100644 index e6def1bd..00000000 --- a/callflow/callflow_ensemble.py +++ /dev/null @@ -1,326 +0,0 @@ -############################################################################## -# Copyright (c) 2018-2019, Lawrence Livermore National Security, LLC. -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of Callflow. -# Created by Suraj Kesavan . -# LLNL-CODE-741008. All rights reserved. -# -# For details, see: https://github.com/LLNL/Callflow -# Please also read the LICENSE file for the MIT License notice. -############################################################################## - -import time -import json -import pandas as pd - -import callflow - -LOGGER = callflow.get_logger(__name__) -from callflow.pipeline import State, Pipeline - -from callflow.utils import getMaxExcTime, getMinExcTime, getMaxIncTime, getMinIncTime -from callflow.timer import Timer -from callflow import EnsembleCCT, EnsembleSuperGraph, BaseCallFlow -from callflow.modules import ( - RankHistogram, - EnsembleAuxiliary, - Gradients, - ModuleHierarchy, - ParameterProjection, - DiffView, -) -from callflow.algorithms import DeltaConSimilarity - -# Create states for each dataset. -# Note: gf would never change from create_gf. -# # Note: fgf would be changed when filter props are changed by client. 
-# Note: df is always updated. -# Note: graph is always updated. -class EnsembleCallFlow(BaseCallFlow): - def __init__(self, config=None, process=None): - super(SingleCallFlow, self).__init__(config, process) - - # Config contains properties set by the input config file. - self.currentMPIBinCount = 20 - self.currentRunBinCount = 20 - - # TODO: should go in appstate - # self.target_df = {} - # for dataset in self.config.dataset_names: - # self.target_df[dataset] = self.states["ensemble_entire"].new_gf.df.loc[ - # self.states["ensemble_entire"].new_gf.df["dataset"] == dataset - # ] - - # -------------------------------------------------------------------------- - # TODo: look at the difference in signature - def _process_states(self, filterBy="Inclusive", filterPerc="10"): - states = {} - # col_names = ["stage", "time"] - # time_perf_df = pd.DataFrame(columns=col_names) - for idx, dataset_name in enumerate(self.config.dataset_names): - states[dataset_name] = State(dataset_name) - LOGGER.info("#########################################") - LOGGER.info(f"Run: {dataset_name}") - LOGGER.info("#########################################") - - stage1 = time.perf_counter() - states[dataset_name] = self.pipeline.create_gf(dataset_name) - stage2 = time.perf_counter() - LOGGER.info(f"Create GraphFrame: {stage2 - stage1}") - LOGGER.info("-----------------------------------------") - - states[dataset_name] = self.pipeline.process_gf( - states[dataset_name], "entire" - ) - stage3 = time.perf_counter() - - LOGGER.info(f"Preprocess GraphFrame: {stage3 - stage2}") - LOGGER.info("-----------------------------------------") - - states[dataset_name] = self.pipeline.hatchetToNetworkX( - states[dataset_name], "path" - ) - stage4 = time.perf_counter() - LOGGER.info(f"Convert to NetworkX graph: {stage4 - stage3}") - LOGGER.info("-----------------------------------------") - - states[dataset_name] = self.pipeline.group(states[dataset_name], "module") - stage5 = time.perf_counter() - LOGGER.info(f"Convert to NetworkX graph: {stage4 - stage3}") - LOGGER.info("-----------------------------------------") - - self.pipeline.write_dataset_gf( - states[dataset_name], dataset_name, "entire", write_graph=False - ) - stage6 = time.perf_counter() - LOGGER.info(f"Write GraphFrame: {stage6 - stage5}") - LOGGER.info("-----------------------------------------") - self.pipeline.write_hatchet_graph(states, dataset_name) - - for idx, dataset_name in enumerate(self.config.dataset_names): - states[dataset_name] = self.pipeline.read_dataset_gf(dataset_name) - - stage7 = time.perf_counter() - states["ensemble_entire"] = self.pipeline.union(states) - stage8 = time.perf_counter() - - LOGGER.info(f"Union GraphFrame: {stage8 - stage7}") - LOGGER.info("-----------------------------------------") - - self.pipeline.write_ensemble_gf(states, "ensemble_entire") - stage9 = time.perf_counter() - LOGGER.info(f"Writing ensemble graph: {stage9 - stage8}") - LOGGER.info("-----------------------------------------") - - stage10 = time.perf_counter() - states["ensemble_filter"] = self.pipeline.filterNetworkX( - states["ensemble_entire"], self.config.filter_perc - ) - stage11 = time.perf_counter() - - LOGGER.info(f"Filter ensemble graph: {stage11 - stage10}") - LOGGER.info("-----------------------------------------") - - stage12 = time.perf_counter() - self.pipeline.write_ensemble_gf(states, "ensemble_filter") - stage13 = time.perf_counter() - LOGGER.info(f"Writing ensemble graph: {stage13 - stage12}") - 
LOGGER.info("-----------------------------------------") - - stage14 = time.perf_counter() - states["ensemble_group"] = self.pipeline.ensemble_group(states, "module") - stage15 = time.perf_counter() - - LOGGER.info(f"Group ensemble graph: {stage15 - stage14}") - LOGGER.info("-----------------------------------------") - stage16 = time.perf_counter() - self.pipeline.write_ensemble_gf(states, "ensemble_group") - stage17 = time.perf_counter() - - LOGGER.info(f"Write group ensemble graph: {stage17 - stage16}") - LOGGER.info("-----------------------------------------") - - # Need to remove the dependence on reading the dataframe again. - states = {} - states["ensemble_entire"] = self.pipeline.read_ensemble_gf("ensemble_entire") - - stage18 = time.perf_counter() - aux = EnsembleAuxiliary( - states, - MPIBinCount=self.currentMPIBinCount, - RunBinCount=self.currentRunBinCount, - datasets=self.config.dataset_names, - config=self.config, - process=True, - write=True, - ) - aux.run() - stage19 = time.perf_counter() - LOGGER.info(f"Dump Gradient, distribution and variations: {stage19 - stage18}") - LOGGER.info("-----------------------------------------") - - return states - - def _readState(self): - states = {} - states["ensemble_entire"] = self.pipeline.read_ensemble_gf("ensemble_entire") - states["ensemble_filter"] = self.pipeline.read_ensemble_gf("ensemble_filter") - states["ensemble_group"] = self.pipeline.read_ensemble_gf("ensemble_group") - states["all_data"] = self.pipeline.read_all_data() - - return states - - def _request(self, action): - action_name = action["name"] - LOGGER.info(f"Action: {action_name}") - datasets = self.config.dataset_names - - if action_name == "init": - self.addIncExcTime() - return self.config - - elif action_name == "ensemble_cct": - nx = EnsembleCCT( - self.states["ensemble_entire"], action["functionsInCCT"], self.config - ) - return nx.g - - elif action_name == "supergraph": - if "reveal_callsites" in action: - reveal_callsites = action["reveal_callsites"] - else: - reveal_callsites = [] - - if "split_entry_module" in action: - split_entry_module = action["split_entry_module"] - else: - split_entry_module = "" - - if "split_callee_module" in action: - split_callee_module = action["split_callee_module"] - else: - split_callee_module = "" - - self.states["ensemble_group"].g = EnsembleSuperGraph( - self.states, - "group_path", - construct_graph=True, - add_data=True, - reveal_callsites=reveal_callsites, - split_entry_module=split_entry_module, - split_callee_module=split_callee_module, - ).agg_g - return self.states["ensemble_group"].g - - elif action_name == "scatterplot": - if action["plot"] == "bland-altman": - state1 = self.states[action["dataset"]] - state2 = self.states[action["dataset2"]] - col = action["col"] - catcol = action["catcol"] - dataset1 = action["dataset"] - dataset2 = action["dataset2"] - ret = BlandAltman( - state1, state2, col, catcol, dataset1, dataset2 - ).results - return ret - - elif action_name == "Gromov-wasserstein": - ret = {} - return ret - - elif action_name == "similarity": - if action["module"] == "all": - dirname = self.config.callflow_dir - name = self.config.runName - similarity_filepath = dirname + "/" + "similarity.json" - with open(similarity_filepath, "r") as similarity_file: - self.similarities = json.load(similarity_file) - else: - self.similarities = {} - for idx, dataset in enumerate(datasets): - self.similarities[dataset] = [] - for idx_2, dataset2 in enumerate(datasets): - union_similarity = Similarity( - 
self.states[dataset2].g, self.states[dataset].g - ) - self.similarities[dataset].append(union_similarity.result) - return self.similarities - - elif action_name == "hierarchy": - mH = ModuleHierarchy( - self.states["ensemble_entire"], action["module"], config=self.config - ) - return mH.result - - elif action_name == "projection": - self.similarities = {} - # dirname = self.config.callflow_dir - # name = self.config.runName - # similarity_filepath = dirname + '/' + 'similarity.json' - # with open(similarity_filepath, 'r') as similarity_file: - # self.similarities = json.load(similarity_file) - result = ParameterProjection( - self.states["ensemble_entire"], - self.similarities, - action["targetDataset"], - n_cluster=action["numOfClusters"], - ).result - return result.to_json(orient="columns") - - elif action_name == "run-information": - ret = [] - for idx, state in enumerate(self.states): - self.states[state].projection_data["dataset"] = state - ret.append(self.states[state].projection_data) - return ret - - elif action_name == "mini-histogram": - minihistogram = MiniHistogram( - self.states["ensemble"], target_datasets=action["target-datasets"] - ) - return minihistogram.result - - elif action_name == "histogram": - histogram = RankHistogram(self.states["ensemble"], action["module"]) - return histogram.result - - elif action_name == "auxiliary": - print(f"Reprocessing: {action['re-process']}") - aux = EnsembleAuxiliary( - self.states, - MPIBinCount=action["MPIBinCount"], - RunBinCount=action["RunBinCount"], - datasets=action["datasets"], - config=self.config, - process=True, - write=False, - ) - if action["re-process"] == 1: - result = aux.run() - else: - result = self.states["all_data"] - # result = aux.filter_dict(result) - self.currentMPIBinCount = action["MPIBinCount"] - self.currentRunBinCount = action["RunBinCount"] - - return result - - elif action_name == "compare": - compareDataset = action["compareDataset"] - targetDataset = action["targetDataset"] - if action["selectedMetric"] == "Inclusive": - selectedMetric = "time (inc)" - elif action["selectedMetric"] == "Exclusive": - selectedMetric = "time" - - compare = DiffView( - self.states["ensemble_entire"], - compareDataset, - targetDataset, - selectedMetric, - ) - return compare.result - - # -------------------------------------------------------------------------- diff --git a/callflow/callflow_single.py b/callflow/callflow_single.py deleted file mode 100644 index 783f34b8..00000000 --- a/callflow/callflow_single.py +++ /dev/null @@ -1,148 +0,0 @@ -############################################################################## -# Copyright (c) 2018-2019, Lawrence Livermore National Security, LLC. -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of Callflow. -# Created by Suraj Kesavan . -# LLNL-CODE-741008. All rights reserved. -# -# For details, see: https://github.com/LLNL/Callflow -# Please also read the LICENSE file for the MIT License notice. 
-############################################################################## - -import time -import json - -import callflow - -LOGGER = callflow.get_logger(__name__) - -from callflow.timer import Timer -from callflow.pipeline import State, Pipeline -from callflow.utils import ( - getMaxExcTime, - getMinExcTime, - getMaxIncTime, - getMinIncTime, -) - -from callflow import SingleCCT, SingleSuperGraph, BaseCallFlow - -from callflow.modules import ( - SingleAuxiliary, - RankHistogram, - MiniHistogram, - RuntimeScatterplot, - FunctionList, -) - - -class SingleCallFlow(BaseCallFlow): - - def __init__(self, config=None, process=False): - super(SingleCallFlow, self).__init__(config, process) - - # -------------------------------------------------------------------------- - def _process_states(self): - for dataset_name in self.config.dataset_names: - state = State(dataset_name) - LOGGER.info("#########################################") - LOGGER.info(f"Run: {dataset_name}") - LOGGER.info("#########################################") - - stage1 = time.perf_counter() - state = self.pipeline.create_gf(dataset_name) - stage2 = time.perf_counter() - LOGGER.info(f"Create GraphFrame: {stage2 - stage1}") - LOGGER.info("-----------------------------------------") - - states = self.pipeline.process_gf(state, "entire") - stage3 = time.perf_counter() - LOGGER.info(f"Preprocess GraphFrame: {stage3 - stage2}") - LOGGER.info("-----------------------------------------") - - state = self.pipeline.hatchetToNetworkX(state, "path") - stage4 = time.perf_counter() - LOGGER.info(f"Convert to NetworkX graph: {stage4 - stage3}") - LOGGER.info("-----------------------------------------") - - state = self.pipeline.group(state, "module") - stage5 = time.perf_counter() - LOGGER.info(f"Group GraphFrame: {stage5 - stage4}") - LOGGER.info("-----------------------------------------") - - self.pipeline.write_dataset_gf( - state, dataset_name, "entire", write_graph=False - ) - stage6 = time.perf_counter() - LOGGER.info(f"Write GraphFrame: {stage6 - stage5}") - LOGGER.info("-----------------------------------------") - LOGGER.info(f'Module: {state.new_gf.df["module"].unique()}') - - return state - - def _read_states(self, datasets): - states = {} - for idx, dataset in enumerate(datasets): - states[dataset] = self.pipeline.read_dataset_gf(dataset) - return states - - def _request(self, action): - LOGGER.info("[Single Mode]", action) - action_name = action["name"] - - if action_name == "init": - self.setConfig() - return self.config - - if "groupBy" in action: - LOGGER.info("Grouping by: {0}".format(action["groupBy"])) - else: - action["groupBy"] = "name" - - dataset = action["dataset"] - state = self.states[dataset] - - LOGGER.info("The selected Dataset is {0}".format(dataset)) - - # Compare against the different operations - if action_name == "reset": - datasets = [dataset] - self.reProcess = True - self.states = self.pipeline( - datasets, action["filterBy"], action["filterPerc"] - ) - self.reProcess = False - self.states = self.pipeline(datasets) - return {} - - elif action_name == "auxiliary": - auxiliary = Auxiliary( - self.states[action["dataset"]], - binCount=action["binCount"], - dataset=action["dataset"], - config=self.config, - ) - return auxiliary.result - - elif action_name == "supergraph": - self.states[dataset].g = SuperGraph( - self.states, dataset, "group_path", construct_graph=True, add_data=True - ).g - return self.states[dataset].g - - elif action_name == "mini-histogram": - minihistogram = MiniHistogram(state) - 
return minihistogram.result - - elif action_name == "cct": - graph = singleCCT( - self.states[action["dataset"]], action["functionsInCCT"], self.config - ) - return graph.g - - elif action_name == "function": - functionlist = FunctionList(state, action["module"], action["nid"]) - return functionlist.result - - # -------------------------------------------------------------------------- diff --git a/callflow/datastructures/cct.py b/callflow/datastructures/cct.py index e69de29b..55628009 100644 --- a/callflow/datastructures/cct.py +++ b/callflow/datastructures/cct.py @@ -0,0 +1,336 @@ +# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# CallFlow Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +# ------------------------------------------------------------------------------ +# Library imports +import math +import pandas as pd +import networkx as nx +from ast import literal_eval as make_tuple + +# ------------------------------------------------------------------------------ +# CallFlow imports +import callflow +from callflow.timer import Timer +from callflow import SuperGraph + +# ------------------------------------------------------------------------------ +# CCT Rendering class. +class CCT(SuperGraph): + def __init__(self, supergraphs={}, tag="", props={}, callsite_count=50): + # Call the SuperGraph class init. + super(CCT, self).__init__(props=props, tag=tag, mode="render") + + # set the current graph being rendered. + self.supergraph = supergraphs[tag] + + # Number of runs in the state. + self.runs = self.supergraph.gf.df["dataset"].unique() + self.columns = ["time (inc)", "time", "name", "module"] + + # callsite count is bounded by the user's input. + if callsite_count == None: + self.callsite_count = len(self.supergraph.gf.df["name"].unique()) + else: + self.callsite_count = int(callsite_count) + + # Put the top callsites into a list. + self.callsites = self.get_top_n_callsites_by_attr( + df=self.supergraph.gf.df, + callsite_count=self.callsite_count, + sort_attr="time (inc)", + ) + + # Filter out the callsites not in the list. + self.supergraph.gf.df = self.supergraph.gf.df[ + self.supergraph.gf.df["name"].isin(self.callsites) + ] + self.datasets = self.supergraph.gf.df["dataset"].unique() + + with self.timer.phase(f"Creating the ensemble CCT: {self.datasets}"): + self.supergraph.gf.nxg = nx.DiGraph() + + # Add paths by "column" = path. + self.add_paths("path") + + # Add node and edge attributes. + with self.timer.phase(f"Add node and edge attributes."): + self.add_node_attributes() + self.add_edge_attributes() + + # Find cycles in the CCT. + with self.timer.phase(f"Find cycles"): + self.supergraph.gf.nxg.cycles = self.find_cycle(self.supergraph.gf.nxg) + + print(self.timer) + + def get_top_n_callsites_by_attr( + self, df=pd.DataFrame([]), callsite_count=50, sort_attr="time (inc)" + ): + """ + Fetches the top n callsites based on attribute (time/time (inc)). 
+ """ + xgroup_df = self.supergraph.gf.df.groupby(["name"]).mean() + sort_xgroup_df = xgroup_df.sort_values(by=[sort_attr], ascending=False) + callsites_df = sort_xgroup_df.nlargest(callsite_count, sort_attr) + return callsites_df.index.values.tolist() + + def ensemble_map(self, df, nodes): + ret = {} + """ + Construct the ensemble map + """ + for callsite in self.supergraph.gf.nxg.nodes(): + if callsite not in self.props["callsite_module_map"]: + module = self.supergraph.gf.df.loc[ + self.supergraph.gf.df["name"] == callsite + ]["module"].unique()[0] + else: + module = self.props["callsite_module_map"][callsite] + + for column in self.columns: + if column not in ret: + ret[column] = {} + if column == "time (inc)": + ret[column][callsite] = self.name_time_inc_map[(module, callsite)] + elif column == "time": + ret[column][callsite] = self.name_time_exc_map[(module, callsite)] + elif column == "name": + ret[column][callsite] = callsite + elif column == "module": + ret[column][callsite] = module + + return ret + + def dataset_map(self, nodes, run): + """ + Construct maps for each dataset. + """ + ret = {} + for callsite in self.supergraph.gf.nxg.nodes(): + if callsite not in self.props["callsite_module_map"]: + module = self.supergraph.gf.df.loc[ + self.supergraph.gf.df["name"] == callsite + ]["module"].unique()[0] + else: + module = self.props["callsite_module_map"][callsite] + + if callsite in self.target_module_callsite_map[run].keys(): + if callsite not in ret: + ret[callsite] = {} + + for column in self.columns: + if column == "time (inc)": + ret[callsite][column] = self.target_module_time_inc_map[run][ + module + ] + + elif column == "time": + ret[callsite][column] = self.target_module_time_exc_map[run][ + module + ] + + elif column == "module": + ret[callsite][column] = module + + elif column == "name": + ret[callsite][column] = callsite + + return ret + + def add_node_attributes(self): + ensemble_mapping = self.ensemble_map( + self.supergraph.gf.df, self.supergraph.gf.nxg.nodes() + ) + + for idx, key in enumerate(ensemble_mapping): + nx.set_node_attributes( + self.supergraph.gf.nxg, name=key, values=ensemble_mapping[key] + ) + + dataset_mapping = {} + for run in self.runs: + dataset_mapping[run] = self.dataset_map(self.supergraph.gf.nxg.nodes(), run) + nx.set_node_attributes( + self.supergraph.gf.nxg, name=run, values=dataset_mapping[run] + ) + + def add_edge_attributes(self): + num_of_calls_mapping = self.edge_map( + self.supergraph.gf.nxg.edges(), "component_path" + ) + nx.set_edge_attributes( + self.supergraph.gf.nxg, name="count", values=num_of_calls_mapping + ) + + def edge_map(self, edges, attr, source=None, orientation=None): + counter = {} + if not self.supergraph.gf.nxg.is_directed() or orientation in ( + None, + "original", + ): + + def tailhead(edge): + return edge[:2] + + elif orientation == "reverse": + + def tailhead(edge): + return edge[1], edge[0] + + elif orientation == "ignore": + + def tailhead(edge): + if edge[-1] == "reverse": + return edge[1], edge[0] + return edge[:2] + + ret = {} + explored = [] + for start_node in self.supergraph.gf.nxg.nbunch_iter(source): + if start_node in explored: + # No loop is possible. + continue + + edges = [] + # All nodes seen in this iteration of edge_dfs + seen = {start_node} + # Nodes in active path. 
+ active_nodes = {start_node} + previous_head = None + + for edge in nx.edge_dfs(self.supergraph.gf.nxg, start_node, orientation): + tail, head = tailhead(edge) + if edge not in counter: + counter[edge] = 0 + if tail == head: + counter[edge] += 1 + else: + counter[edge] = 1 + + return counter + + def create_source_targets(self, path): + module = "" + edges = [] + + for idx, callsite in enumerate(path): + if idx == len(path) - 1: + break + + source = callflow.utils.sanitize_name(path[idx]) + target = callflow.utils.sanitize_name(path[idx + 1]) + + edges.append( + {"source": source, "target": target,} + ) + return edges + + def add_paths(self, path): + paths = self.supergraph.gf.df[path].tolist() + + for idx, path in enumerate(paths): + if isinstance(path, float): + return [] + path = make_tuple(path) + source_targets = self.create_source_targets(path) + for edge in source_targets: + source = edge["source"] + target = edge["target"] + if not self.supergraph.gf.nxg.has_edge(source, target): + self.supergraph.gf.nxg.add_edge(source, target) + + def find_cycle(self, G, source=None, orientation=None): + if not G.is_directed() or orientation in (None, "original"): + + def tailhead(edge): + return edge[:2] + + elif orientation == "reverse": + + def tailhead(edge): + return edge[1], edge[0] + + elif orientation == "ignore": + + def tailhead(edge): + if edge[-1] == "reverse": + return edge[1], edge[0] + return edge[:2] + + explored = set() + cycle = [] + count = 0 + final_node = None + for start_node in G.nbunch_iter(source): + if start_node in explored: + # No loop is possible. + continue + + edges = [] + # All nodes seen in this iteration of edge_dfs + seen = {start_node} + # Nodes in active path. + active_nodes = {start_node} + previous_head = None + + for edge in nx.edge_dfs(G, start_node, orientation): + # Determine if this edge is a continuation of the active path. + tail, head = tailhead(edge) + if head in explored: + # Then we've already explored it. No loop is possible. + continue + if previous_head is not None and tail != previous_head: + # This edge results from backtracking. + # Pop until we get a node whose head equals the current tail. + # So for example, we might have: + # (0, 1), (1, 2), (2, 3), (1, 4) + # which must become: + # (0, 1), (1, 4) + while True: + try: + popped_edge = edges.pop() + except IndexError: + edges = [] + active_nodes = {tail} + break + else: + popped_head = tailhead(popped_edge)[1] + active_nodes.remove(popped_head) + + if edges: + last_head = tailhead(edges[-1])[1] + if tail == last_head: + break + edges.append(edge) + + if head in active_nodes: + # We have a loop! + cycle.extend(edges) + final_node = head + break + else: + seen.add(head) + active_nodes.add(head) + previous_head = head + + if cycle: + count += 1 + break + else: + explored.update(seen) + + else: + assert len(cycle) == 0 + # raise nx.exception.NetworkXNoCycle('No cycle found.') + + # We now have a list of edges which ends on a cycle. + # So we need to remove from the beginning edges that are not relevant. + i = 0 + for i, edge in enumerate(cycle): + tail, head = tailhead(edge) + if tail == final_node: + break + return cycle[i:] diff --git a/callflow/datastructures/cct_ensemble.py b/callflow/datastructures/cct_ensemble.py deleted file mode 100644 index d4431937..00000000 --- a/callflow/datastructures/cct_ensemble.py +++ /dev/null @@ -1,372 +0,0 @@ -############################################################################## -# Copyright (c) 2018-2019, Lawrence Livermore National Security, LLC. 
-# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of Callflow. -# Created by Suraj Kesavan . -# LLNL-CODE-741008. All rights reserved. -# -# For details, see: https://github.com/LLNL/Callflow -# Please also read the LICENSE file for the MIT License notice. -############################################################################## - -import pandas as pd -import networkx as nx -from ast import literal_eval as make_tuple -import math -from callflow.timer import Timer -from callflow.utils import sanitizeName - - -class EnsembleCCT: - def __init__(self, state, functionsInCCT, config): - self.timer = Timer() - self.config = config - number_of_nodes = len(state.new_gf.df["name"].unique()) - self.functionsInCCT = int(functionsInCCT) - - # self.entire_graph = state.g - # self.entire_df = state.df - self.entire_graph = state.new_gf.g - self.entire_df = state.new_gf.df - - self.runs = self.entire_df["dataset"].unique() - self.columns = ["time (inc)", "time", "name", "module"] - # 'imbalance_perc'] - - self.sort_attr = "time" - self.callsites = self.get_top_n_callsites_by(self.functionsInCCT) - - self.fdf = self.entire_df[self.entire_df["name"].isin(self.callsites)] - - self.datasets = self.fdf["dataset"].unique() - with self.timer.phase(f"Creating the ensemble CCT: {self.datasets}"): - self.g = nx.DiGraph() - self.add_paths("path") - - with self.timer.phase(f"Creating the data maps."): - self.cct_df = self.entire_df[self.entire_df["name"].isin(self.g.nodes())] - self.create_ensemble_maps(self.cct_df) - self.create_target_maps(self.cct_df) - - with self.timer.phase(f"Add node and edge attributes."): - self.add_node_attributes() - self.add_edge_attributes() - - with self.timer.phase(f"Find cycles"): - self.g.cycles = self.find_cycle(self.g) - - print(self.timer) - - def get_top_n_callsites_by(self, count): - xgroup_df = self.entire_df.groupby(["name"]).mean() - sort_xgroup_df = xgroup_df.sort_values(by=[self.sort_attr], ascending=False) - callsites_df = sort_xgroup_df.nlargest(self.functionsInCCT, "time (inc)") - - return callsites_df.index.values.tolist() - - def create_target_maps(self, df): - self.target_df = {} - self.target_modules = {} - self.target_module_group_df = {} - self.target_module_name_group_df = {} - self.target_module_callsite_map = {} - self.target_module_time_inc_map = {} - self.target_module_time_exc_map = {} - self.target_name_time_inc_map = {} - self.target_name_time_exc_map = {} - - for run in self.runs: - # Reduce the entire_df to respective target dfs. - self.target_df[run] = df.loc[df["dataset"] == run] - - # Unique modules in the target run - self.target_modules[run] = self.target_df[run]["module"].unique() - - # Group the dataframe in two ways. - # 1. by module - # 2. by module and callsite - self.target_module_group_df[run] = self.target_df[run].groupby(["module"]) - self.target_module_name_group_df[run] = self.target_df[run].groupby( - ["name"] - ) - - # Module map for target run {'module': [Array of callsites]} - self.target_module_callsite_map[run] = self.target_module_group_df[run][ - "name" - ].unique() - - # Inclusive time maps for the module level and callsite level. - self.target_module_time_inc_map[run] = ( - self.target_module_group_df[run]["time (inc)"].max().to_dict() - ) - self.target_name_time_inc_map[run] = ( - self.target_module_name_group_df[run]["time (inc)"].max().to_dict() - ) - - # Exclusive time maps for the module level and callsite level. 
- self.target_module_time_exc_map[run] = ( - self.target_module_group_df[run]["time"].max().to_dict() - ) - self.target_name_time_exc_map[run] = ( - self.target_module_name_group_df[run]["time"].max().to_dict() - ) - - def create_ensemble_maps(self, df): - self.modules = df["module"].unique() - - self.module_name_group_df = df.groupby(["module", "name"]) - self.module_group_df = df.groupby(["module"]) - - # Module map for ensemble {'module': [Array of callsites]} - self.module_callsite_map = df["name"].unique() - - # Inclusive time maps for the module level and callsite level. - self.module_time_inc_map = self.module_group_df["time (inc)"].max().to_dict() - self.name_time_inc_map = self.module_name_group_df["time (inc)"].max().to_dict() - - # Exclusive time maps for the module level and callsite level. - self.module_time_exc_map = self.module_group_df["time"].max().to_dict() - self.name_time_exc_map = self.module_name_group_df["time"].max().to_dict() - - def ensemble_map(self, df, nodes): - ret = {} - - # loop through the nodes - for callsite in self.g.nodes(): - if callsite not in self.config.callsite_module_map: - module = self.entire_df.loc[self.entire_df["name"] == callsite][ - "module" - ].unique()[0] - else: - module = self.config.callsite_module_map[callsite] - - for column in self.columns: - if column not in ret: - ret[column] = {} - if column == "time (inc)": - ret[column][callsite] = self.name_time_inc_map[(module, callsite)] - elif column == "time": - ret[column][callsite] = self.name_time_exc_map[(module, callsite)] - elif column == "name": - ret[column][callsite] = callsite - elif column == "module": - ret[column][callsite] = module - - return ret - - def dataset_map(self, nodes, run): - ret = {} - for callsite in self.g.nodes(): - if callsite not in self.config.callsite_module_map: - module = self.entire_df.loc[self.entire_df["name"] == callsite][ - "module" - ].unique()[0] - else: - module = self.config.callsite_module_map[callsite] - - if callsite in self.target_module_callsite_map[run].keys(): - if callsite not in ret: - ret[callsite] = {} - - for column in self.columns: - if column == "time (inc)": - ret[callsite][column] = self.target_module_time_inc_map[run][ - module - ] - - elif column == "time": - ret[callsite][column] = self.target_module_time_exc_map[run][ - module - ] - - elif column == "module": - ret[callsite][column] = module - - elif column == "name": - ret[callsite][column] = callsite - - return ret - - def add_node_attributes(self): - ensemble_mapping = self.ensemble_map(self.entire_df, self.g.nodes()) - - for idx, key in enumerate(ensemble_mapping): - nx.set_node_attributes(self.g, name=key, values=ensemble_mapping[key]) - - # dataset_mapping = {} - # for run in self.runs: - # dataset_mapping[run] = self.dataset_map(self.g.nodes(), run) - - # nx.set_node_attributes(self.g, name=run, values=dataset_mapping[run]) - - def add_edge_attributes(self): - num_of_calls_mapping = self.edge_map(self.g.edges(), "component_path") - nx.set_edge_attributes(self.g, name="count", values=num_of_calls_mapping) - - def edge_map(self, edges, attr, source=None, orientation=None): - counter = {} - if not self.g.is_directed() or orientation in (None, "original"): - - def tailhead(edge): - return edge[:2] - - elif orientation == "reverse": - - def tailhead(edge): - return edge[1], edge[0] - - elif orientation == "ignore": - - def tailhead(edge): - if edge[-1] == "reverse": - return edge[1], edge[0] - return edge[:2] - - ret = {} - explored = [] - for start_node in 
self.g.nbunch_iter(source): - if start_node in explored: - # No loop is possible. - continue - - edges = [] - # All nodes seen in this iteration of edge_dfs - seen = {start_node} - # Nodes in active path. - active_nodes = {start_node} - previous_head = None - - for edge in nx.edge_dfs(self.g, start_node, orientation): - tail, head = tailhead(edge) - if edge not in counter: - counter[edge] = 0 - if tail == head: - counter[edge] += 1 - else: - counter[edge] = 1 - - return counter - - def create_source_targets(self, path): - module = "" - edges = [] - - for idx, callsite in enumerate(path): - if idx == len(path) - 1: - break - - source = sanitizeName(path[idx]) - target = sanitizeName(path[idx + 1]) - - edges.append( - {"source": source, "target": target,} - ) - return edges - - def add_paths(self, path): - paths = self.fdf[path].tolist() - - for idx, path in enumerate(paths): - if isinstance(path, float): - return [] - path = make_tuple(path) - source_targets = self.create_source_targets(path) - for edge in source_targets: - source = edge["source"] - target = edge["target"] - if not self.g.has_edge(source, target): - self.g.add_edge(source, target) - - def find_cycle(self, G, source=None, orientation=None): - if not G.is_directed() or orientation in (None, "original"): - - def tailhead(edge): - return edge[:2] - - elif orientation == "reverse": - - def tailhead(edge): - return edge[1], edge[0] - - elif orientation == "ignore": - - def tailhead(edge): - if edge[-1] == "reverse": - return edge[1], edge[0] - return edge[:2] - - explored = set() - cycle = [] - count = 0 - final_node = None - for start_node in G.nbunch_iter(source): - if start_node in explored: - # No loop is possible. - continue - - edges = [] - # All nodes seen in this iteration of edge_dfs - seen = {start_node} - # Nodes in active path. - active_nodes = {start_node} - previous_head = None - - for edge in nx.edge_dfs(G, start_node, orientation): - # Determine if this edge is a continuation of the active path. - tail, head = tailhead(edge) - if head in explored: - # Then we've already explored it. No loop is possible. - continue - if previous_head is not None and tail != previous_head: - # This edge results from backtracking. - # Pop until we get a node whose head equals the current tail. - # So for example, we might have: - # (0, 1), (1, 2), (2, 3), (1, 4) - # which must become: - # (0, 1), (1, 4) - while True: - try: - popped_edge = edges.pop() - except IndexError: - edges = [] - active_nodes = {tail} - break - else: - popped_head = tailhead(popped_edge)[1] - active_nodes.remove(popped_head) - - if edges: - last_head = tailhead(edges[-1])[1] - if tail == last_head: - break - edges.append(edge) - - if head in active_nodes: - # We have a loop! - cycle.extend(edges) - final_node = head - break - else: - seen.add(head) - active_nodes.add(head) - previous_head = head - - if cycle: - count += 1 - break - else: - explored.update(seen) - - else: - assert len(cycle) == 0 - # raise nx.exception.NetworkXNoCycle('No cycle found.') - - # We now have a list of edges which ends on a cycle. - # So we need to remove from the beginning edges that are not relevant. 
- i = 0 - for i, edge in enumerate(cycle): - tail, head = tailhead(edge) - if tail == final_node: - break - return cycle[i:] diff --git a/callflow/datastructures/cct_single.py b/callflow/datastructures/cct_single.py deleted file mode 100644 index 7c6206bb..00000000 --- a/callflow/datastructures/cct_single.py +++ /dev/null @@ -1,280 +0,0 @@ -############################################################################## -# Copyright (c) 2018-2019, Lawrence Livermore National Security, LLC. -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of Callflow. -# Created by Suraj Kesavan . -# LLNL-CODE-741008. All rights reserved. -# -# For details, see: https://github.com/LLNL/Callflow -# Please also read the LICENSE file for the MIT License notice. -############################################################################## - -import pandas as pd -import networkx as nx -from ast import literal_eval as make_tuple -import math -from callflow.timer import Timer -from callflow.utils import sanitizeName - - -class SingleCCT: - def __init__(self, state, functionsInCCT, config): - self.timer = Timer() - - self.g = state.new_gf.nxg - self.df = state.new_gf.df - self.functionsInCCT = int(functionsInCCT) - self.config = config - - self.columns = ["time (inc)", "time", "name", "module"] - # 'imbalance_perc'] - - self.sort_attr = "time" - - print(f"Total callsite in CCT: {len(self.df['name'].unique())}") - - with self.timer.phase("Creating data maps"): - self.create_ensemble_maps() - - self.callsites = self.get_top_n_callsites_by(self.functionsInCCT) - self.fdf = self.df[self.df["name"].isin(self.callsites)] - - self.dataset = self.fdf["dataset"].unique() - with self.timer.phase(f"Creating the single CCT {self.dataset}"): - self.run() - print(self.timer) - - def get_top_n_callsites_by(self, count): - xgroup_df = self.df.groupby(["name"]).mean() - sort_xgroup_df = xgroup_df.sort_values(by=[self.sort_attr], ascending=False) - callsites_df = sort_xgroup_df.nlargest(self.functionsInCCT, self.sort_attr) - - return callsites_df.index.values.tolist() - - def create_ensemble_maps(self): - self.modules = self.df["module"].unique() - - self.module_name_group_df = self.df.groupby(["module", "name"]) - self.module_group_df = self.df.groupby(["module"]) - - # Module map for ensemble {'module': [Array of callsites]} - self.module_callsite_map = self.module_group_df["name"].unique() - - # Inclusive time maps for the module level and callsite level. - self.module_time_inc_map = self.module_group_df["time (inc)"].max().to_dict() - self.name_time_inc_map = self.module_name_group_df["time (inc)"].max().to_dict() - - # Exclusive time maps for the module level and callsite level. 
- self.module_time_exc_map = self.module_group_df["time"].max().to_dict() - self.name_time_exc_map = self.module_name_group_df["time"].max().to_dict() - - def dataset_map(self): - ret = {} - for callsite in self.g.nodes(): - if callsite not in self.config.callsite_module_map: - module = self.df.loc[self.df["name"] == callsite]["module"].unique()[0] - else: - module = self.config.callsite_module_map[callsite] - - for column in self.columns: - if column not in ret: - ret[column] = {} - - if column == "time (inc)": - ret[column][callsite] = self.name_time_inc_map[(module, callsite)] - - elif column == "time": - ret[column][callsite] = self.name_time_exc_map[(module, callsite)] - - elif column == "module": - ret[column][callsite] = module - - elif column == "name": - ret[column][callsite] = callsite - - return ret - - def add_node_attributes(self): - dataset_mapping = self.dataset_map() - - for idx, key in enumerate(dataset_mapping): - nx.set_node_attributes(self.g, name=key, values=dataset_mapping[key]) - - def add_edge_attributes(self): - num_of_calls_mapping = self.edge_map(self.g.edges(), "component_path") - nx.set_edge_attributes(self.g, name="count", values=num_of_calls_mapping) - - def edge_map(self, edges, attr, source=None, orientation=None): - counter = {} - if not self.g.is_directed() or orientation in (None, "original"): - - def tailhead(edge): - return edge[:2] - - elif orientation == "reverse": - - def tailhead(edge): - return edge[1], edge[0] - - elif orientation == "ignore": - - def tailhead(edge): - if edge[-1] == "reverse": - return edge[1], edge[0] - return edge[:2] - - ret = {} - explored = [] - for start_node in self.g.nbunch_iter(source): - if start_node in explored: - # No loop is possible. - continue - - edges = [] - # All nodes seen in this iteration of edge_dfs - seen = {start_node} - # Nodes in active path. - active_nodes = {start_node} - previous_head = None - - for edge in nx.edge_dfs(self.g, start_node, orientation): - tail, head = tailhead(edge) - if edge not in counter: - counter[edge] = 0 - if tail == head: - counter[edge] += 1 - else: - counter[edge] = 1 - - return counter - - def create_source_targets(self, path): - module = "" - edges = [] - - for idx, callsite in enumerate(path): - if idx == len(path) - 1: - break - - source = sanitizeName(path[idx]) - target = sanitizeName(path[idx + 1]) - - edges.append( - {"source": source, "target": target,} - ) - return edges - - def add_paths(self, path): - paths = self.fdf[path].tolist() - - for idx, path in enumerate(paths): - if isinstance(path, float): - return [] - path = make_tuple(path) - source_targets = self.create_source_targets(path) - for edge in source_targets: - source = edge["source"] - target = edge["target"] - if not self.g.has_edge(source, target): - self.g.add_edge(source, target) - - def find_cycle(self, G, source=None, orientation=None): - if not G.is_directed() or orientation in (None, "original"): - - def tailhead(edge): - return edge[:2] - - elif orientation == "reverse": - - def tailhead(edge): - return edge[1], edge[0] - - elif orientation == "ignore": - - def tailhead(edge): - if edge[-1] == "reverse": - return edge[1], edge[0] - return edge[:2] - - explored = set() - cycle = [] - count = 0 - final_node = None - for start_node in G.nbunch_iter(source): - if start_node in explored: - # No loop is possible. - continue - - edges = [] - # All nodes seen in this iteration of edge_dfs - seen = {start_node} - # Nodes in active path. 
- active_nodes = {start_node} - previous_head = None - - for edge in nx.edge_dfs(G, start_node, orientation): - # Determine if this edge is a continuation of the active path. - tail, head = tailhead(edge) - if head in explored: - # Then we've already explored it. No loop is possible. - continue - if previous_head is not None and tail != previous_head: - # This edge results from backtracking. - # Pop until we get a node whose head equals the current tail. - # So for example, we might have: - # (0, 1), (1, 2), (2, 3), (1, 4) - # which must become: - # (0, 1), (1, 4) - while True: - try: - popped_edge = edges.pop() - except IndexError: - edges = [] - active_nodes = {tail} - break - else: - popped_head = tailhead(popped_edge)[1] - active_nodes.remove(popped_head) - - if edges: - last_head = tailhead(edges[-1])[1] - if tail == last_head: - break - edges.append(edge) - - if head in active_nodes: - # We have a loop! - cycle.extend(edges) - final_node = head - break - else: - seen.add(head) - active_nodes.add(head) - previous_head = head - - if cycle: - count += 1 - break - else: - explored.update(seen) - - else: - assert len(cycle) == 0 - # raise nx.exception.NetworkXNoCycle('No cycle found.') - - # We now have a list of edges which ends on a cycle. - # So we need to remove from the beginning edges that are not relevant. - i = 0 - for i, edge in enumerate(cycle): - tail, head = tailhead(edge) - if tail == final_node: - break - return cycle[i:] - - def run(self): - self.g = nx.DiGraph() - self.add_paths("path") - self.add_node_attributes() - self.add_edge_attributes() - self.g.cycles = self.find_cycle(self.g) diff --git a/callflow/datastructures/ensemblegraph.py b/callflow/datastructures/ensemblegraph.py new file mode 100644 index 00000000..21a667e7 --- /dev/null +++ b/callflow/datastructures/ensemblegraph.py @@ -0,0 +1,140 @@ +import networkx as nx +import pandas as pd +import callflow +from callflow import GraphFrame, SuperGraph + +LOGGER = callflow.get_logger(__name__) + + +class EnsembleGraph(SuperGraph): + """ + TODO: Clean this up. + SuperGraph that handles the ensemble processing. + """ + + def __init__(self, props={}, tag="", mode="process", supergraphs={}): + # this stores the mapping for each run's data (i.e., Dataset) + self.supergraphs = supergraphs + + super().__init__(props, tag, mode) + + # For each callsite we store the vector here. + self.vector = {} + + def create_gf(self, data=None): + """ + Create the graphframes for the ensemble operation. + """ + # Set the gf as first of the dataset's gf + if data: + self.gf = callflow.GraphFrame.from_data(data) + else: + first_dataset = list(self.supergraphs.keys())[0] + LOGGER.debug(f"Base for the union operation is: {first_dataset}") + + # TODO: do a deep copy. + # Instead of a deep copy, create a new graphframe and return it. + self.gf = self.supergraphs[first_dataset].gf + self.gf.df = self.union_df() + # There is no way to convert networkX to hatchet graph yet. So we are setting this to None. + self.gf.graph = None + self.gf.nxg = self.union_nxg() + + assert isinstance(self.gf, callflow.GraphFrame) + + def union_df(self): + """ + Union the dataframes. + """ + df = pd.DataFrame([]) + for idx, tag in enumerate(self.supergraphs): + gf = self.supergraphs[tag].gf + + df = pd.concat([df, gf.df], sort=True) + + assert isinstance(df, pd.DataFrame) + return df + + def union_nxg(self): + """ + Union the netwprkX graph. 
+ """ + nxg = nx.DiGraph() + for idx, tag in enumerate(self.supergraphs): + LOGGER.debug("-=========================-") + LOGGER.debug(tag) + self.union_nxg_recurse(nxg, self.supergraphs[tag].gf.nxg) + + return nxg + + # Return the union of graphs G and H. + def union_nxg_recurse(self, nxg_1, nxg_2, name=None, rename=(None, None)): + """ + Iterative concatenation of nodes from nxg_2 to nxg_1. + """ + if not nxg_1.is_multigraph() == nxg_2.is_multigraph(): + raise nx.NetworkXError("G and H must both be graphs or multigraphs.") + + nxg_1.update(nxg_2) + + renamed_nodes = self.add_prefix(nxg_1, rename[1]) + + is_same = set(nxg_1) == set(nxg_2) + LOGGER.debug(f"Nodes in Graph 1 and Graph 2 are same? : {is_same}") + if set(nxg_1) != set(nxg_2): + LOGGER.debug(f"Difference is { list(set(nxg_1) - set(nxg_2))}") + LOGGER.debug(f"Nodes in Graph 1: {set(nxg_1)}") + LOGGER.debug(f"Nodes in Graph 2: {set(nxg_2)}") + LOGGER.debug("-=========================-") + + if nxg_2.is_multigraph(): + new_edges = nxg_2.edges(keys=True, data=True) + else: + new_edges = nxg_2.edges(data=True) + + # add nodes and edges. + nxg_1.add_nodes_from(nxg_2) + nxg_1.add_edges_from(new_edges) + + # # add node attributes for each run + # for n in renamed_nodes: + # self.add_node_attributes(nxg_1, n, name) + + return nxg_1 + + # rename graph to obtain disjoint node labels + def add_prefix(self, graph, prefix): + if prefix is None: + return graph + + def label(x): + if is_string_like(x): + name = prefix + x + else: + name = prefix + repr(x) + return name + + return nx.relabel_nodes(graph, label) + + def add_edge_attributes(self): + number_of_runs_mapping = self.number_of_runs() + nx.set_edge_attributes( + self.union, name="number_of_runs", values=number_of_runs_mapping + ) + + def number_of_runs(self): + ret = {} + for idx, name in enumerate(self.unionuns): + for edge in self.unionuns[name].edges(): + if edge not in ret: + ret[edge] = 0 + ret[edge] += 1 + return ret + + def add_node_attributes(self, H, node, dataset_name): + """ + TODO: Hoist this information to the df directly. + """ + for idx, (key, val) in enumerate(H.nodes.items()): + if dataset_name not in self.nxg.nodes[node]: + self.nxg.nodes[node] = self.vector[node] diff --git a/callflow/datastructures/graphframe.py b/callflow/datastructures/graphframe.py index 159421f6..abe2fc10 100644 --- a/callflow/datastructures/graphframe.py +++ b/callflow/datastructures/graphframe.py @@ -1,13 +1,28 @@ -import os +# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# CallFlow Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT +# ------------------------------------------------------------------------------ +# Library imports +import os +import pandas as pd import hatchet as ht +import networkx as nx + +# ------------------------------------------------------------------------------ +# CallFlow imports import callflow LOGGER = callflow.get_logger(__name__) - +# ------------------------------------------------------------------------------ +# GraphFrame Class class GraphFrame(ht.GraphFrame): def __init__(self, graph=None, dataframe=None, exc_metrics=None, inc_metrics=None): + """ + + """ # TODO: will we ever want to create a graphframe without data? 
if graph is not None and dataframe is not None: @@ -17,45 +32,183 @@ def __init__(self, graph=None, dataframe=None, exc_metrics=None, inc_metrics=Non self.df = self.dataframe # save a networkx graph - self.nxg = None + if graph: + self.nxg = self.hatchet_graph_to_nxg(graph) # -------------------------------------------------------------------------- - # promote a hatchet graph frame to callflow graph frame + # Hatchet's GraphFrame utilities. + @staticmethod def from_hatchet(gf): - + """ + Promotes a hatchet graph frame to callflow graph frame + """ assert isinstance(gf, ht.GraphFrame) return GraphFrame(gf.graph, gf.dataframe, gf.exc_metrics, gf.inc_metrics) # create a graph frame directly from the config @staticmethod def from_config(config, name): - + """ + Uses config file to create a graphframe. + """ LOGGER.info(f"Creating graphframes: {name}") - LOGGER.info(f"Data path: {config.data_path}") + LOGGER.info(f"Data path: {config['data_path']}") - if config.format[name] == "hpctoolkit": - gf = ht.GraphFrame.from_hpctoolkit(config.data_path) + if config["format"][name] == "hpctoolkit": + gf = ht.GraphFrame.from_hpctoolkit(config["data_path"]) - elif config.format[name] == "caliper": - gf = ht.GraphFrame.from_caliper(config.data_path) + elif config["format"][name] == "caliper": + gf = ht.GraphFrame.from_caliper(config["data_path"]) - elif config.format[name] == "caliper_json": - data_path = os.path.join(config.data_path, config.paths[name]) + elif config["format"][name] == "caliper_json": + data_path = os.path.join(config["data_path"], config["paths"][name]) gf = ht.GraphFrame.from_caliper(data_path, query="") - elif config.format[name] == "gprof": - gf = ht.GraphFrame.from_grof_dot(config.data_path) + elif config["format"][name] == "gprof": + gf = ht.GraphFrame.from_grof_dot(config["data_path"]) - elif config.format[name] == "literal": - gf = ht.GraphFrame.from_literal(config.data_path) + elif config["format"][name] == "literal": + gf = ht.GraphFrame.from_literal(config["data_path"]) - elif config.format[name] == "lists": - gf = ht.GraphFrame.from_lists(config.data_path) + elif config["format"][name] == "lists": + gf = ht.GraphFrame.from_lists(config["data_path"]) return GraphFrame.from_hatchet(gf) + @staticmethod + def from_data(data): + """ + Create GraphFrame from 3 sets of information : df, graph, nxg. + """ + # Hatchet requires node and rank to be indexes. + data["df"] = data["df"].set_index(["node", "rank"]) + + # Create a graphframe using Hatchet. + gf = GraphFrame(dataframe=data["df"], graph=data["graph"]) + + # Store the nxg. + gf.nxg = data["nxg"] + + # remove the set indexes to maintain consistency. + gf.df = gf.df.reset_index(drop=False) + return gf + # -------------------------------------------------------------------------- + # callflow.graph utilities. + # + @staticmethod + def hatchet_graph_to_nxg(hatchet_graph): + """ + Constructs a networkX graph from hatchet graph. + """ + nxg = nx.DiGraph() + for root in hatchet_graph.roots: + node_gen = root.traverse() + + root_dict = callflow.utils.node_dict_from_frame(root.frame) + root_name = root_dict["name"] + root_paths = root.paths() + node = root + + try: + while node: + # `getNodeDictFromFrame` converts the hatchet's frame to + node_dict = callflow.utils.node_dict_from_frame(node.frame) + node_name = node_dict["name"] + + # Get all node paths from hatchet. + node_paths = node.paths() + + # Loop through all the node paths. 
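To give a feel for what the loop below produces, a toy sketch (frame contents are invented): for every call path of length two or more, one edge is added between the labels of its last two frames, with the callsite line number appended whenever it is known. callflow.utils.sanitize_name is simplified away here.

import networkx as nx

def label(frame):
    # Simplified stand-in for sanitize_name plus the ":line" suffix used below.
    return frame["name"] if frame["line"] == "NA" else f'{frame["name"]}:{frame["line"]}'

nxg = nx.DiGraph()
node_path = [
    {"name": "main", "line": "NA"},
    {"name": "physics", "line": 12},
    {"name": "solve", "line": 42},
]
if len(node_path) >= 2:
    nxg.add_edge(label(node_path[-2]), label(node_path[-1]))

print(list(nxg.edges()))  # [('physics:12', 'solve:42')]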
+ for node_path in node_paths: + if len(node_path) >= 2: + + source_node_dict = callflow.utils.node_dict_from_frame( + node_path[-2] + ) + target_node_dict = callflow.utils.node_dict_from_frame( + node_path[-1] + ) + + if source_node_dict["line"] != "NA": + source_node_name = ( + callflow.utils.sanitize_name( + source_node_dict["name"] + ) + + ":" + + str(source_node_dict["line"]) + ) + else: + source_node_name = callflow.utils.sanitize_name( + source_node_dict["name"] + ) + if target_node_dict["line"] != "NA": + target_node_name = ( + callflow.utils.sanitize_name( + target_node_dict["name"] + ) + + ":" + + str(target_node_dict["line"]) + ) + else: + target_node_name = callflow.utils.sanitize_name( + target_node_dict["name"] + ) + + nxg.add_edge(source_node_name, target_node_name) + + node = next(node_gen) + + except StopIteration: + pass + finally: + del root + + return nxg + + # -------------------------------------------------------------------------- + # callflow.nxg utilities. + + @staticmethod + def add_prefix(graph, prefix): + """ + Rename graph to obtain disjoint node labels + """ + if prefix is None: + return graph + + def label(x): + if is_string_like(x): + name = prefix + x + else: + name = prefix + repr(x) + return name + + return nx.relabel_nodes(graph, label) + + @staticmethod + def tailhead(edge): + return (edge[0], edge[1]) + + @staticmethod + def tailheadDir(edge): + return (str(edge[0]), str(edge[1]), self.edge_direction[edge]) + + @staticmethod + def leaves_below(nxg, node): + return set( + sum( + ( + [vv for vv in v if nxg.out_degree(vv) == 0] + for k, v in nx.dfs_successors(nxg, node).items() + ), + [], + ) + ) + + # -------------------------------------------------------------------------- + # callflow.df utilities def lookup(self, node): return self.df.loc[ (self.df["name"] == node.callpath[-1]) & (self.df["nid"] == node.nid) @@ -70,30 +223,7 @@ def lookup_with_name(self, name): def lookup_with_vis_nodeName(self, name): return self.df.loc[self.df["vis_node_name"] == name] - """ - def lookup_by_column(self, _hash, col_name): - ret = [] - node_df = self.df.loc[self.df["node"] == self.map[str(_hash)]] - node_df_T = node_df.T.squeeze() - node_df_T_attr = node_df_T.loc[col_name] - if node_df_T_attr is not None: - if type(node_df_T_attr) is str or type(node_df_T_attr) is float: - ret.append(node_df_T_attr) - else: - ret = node_df_T_attr.tolist() - return ret - """ - # -------------------------------------------------------------------------- def update_df(self, col_name, mapping): self.df[col_name] = self.df["name"].apply( lambda node: mapping[node] if node in mapping.keys() else "" ) - - def grouped_df(self, attr): - pass - """ - self.gdf[attr] = self.df.groupby(attr, as_index=True, squeeze=True) - self.gdfKeys = self.gdf[attr].groups.keys() - """ - - # -------------------------------------------------------------------------- diff --git a/callflow/datastructures/supergraph.py b/callflow/datastructures/supergraph.py index 525a26ca..50d4ae45 100644 --- a/callflow/datastructures/supergraph.py +++ b/callflow/datastructures/supergraph.py @@ -1,21 +1,418 @@ +# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# CallFlow Project Developers. See the top-level LICENSE file for details. 
+#
+# SPDX-License-Identifier: MIT
+
+# ------------------------------------------------------------------------------
+# Library imports
+import os
+import json
+import copy
+import numpy as np
+import pandas as pd
+import networkx as nx
+from networkx.readwrite import json_graph
+from ast import literal_eval as make_list
+
+# ------------------------------------------------------------------------------
+# CallFlow imports
+import callflow
+from callflow.timer import Timer
+from callflow.operations import Process, Group, Filter
+from callflow.modules import EnsembleAuxiliary, SingleAuxiliary
+
+LOGGER = callflow.get_logger(__name__)
+
+# ------------------------------------------------------------------------------
+# SuperGraph Class
+class SuperGraph(object):
+    def __init__(self, props={}, tag="", mode="process"):
+        self.timer = Timer()
+
+        # Props is the information contained in the config object.
+        # We duplicate it so we can attach more information without modifying the config as a side effect.
+        self.props = props
+        self.dirname = self.props["save_path"]
+
+        # it appears we're using name as "union", "filter", etc.
+        # this is not a data set name!
+        self.tag = tag
+
+        # Mode is either process or render.
+        self.mode = mode
+
+        # Variables used by `create_target_maps`.
+        self.target_df = {}
+        self.target_modules = {}
+        self.target_module_group_df = {}
+        self.target_module_name_group_df = {}
+        self.target_module_callsite_map = {}
+        self.target_module_time_inc_map = {}
+        self.target_module_time_exc_map = {}
+        self.target_name_time_inc_map = {}
+        self.target_name_time_exc_map = {}
+
+        # Create a graphframe based on the mode.
+        if mode == "process":
+            self.create_gf()
+        elif mode == "render":
+            data = self.read_gf(read_parameter=self.props["read_parameter"])
+            self.create_gf(data=data)
+            self.auxiliary_data = self.read_auxiliary_data()
+
+            with self.timer.phase(f"Creating the data maps."):
+                self.cct_df = self.gf.df[self.gf.df["name"].isin(self.gf.nxg.nodes())]
+                self.create_ensemble_maps()
+                for dataset in self.props["dataset_names"]:
+                    self.create_target_maps(dataset)
+
+        self.projection_data = {}
+
+    def _getter(self):
+        """
+        Getter for graphframe. Returns the graphframe.
+        """
+        return self.gf
+
+    def _setter(self, gf):
+        """
+        Setter for graphframe. Hooks the graphframe.
+        """
+        assert isinstance(gf, callflow.GraphFrame)
+
+        self.gf = gf
+
+    def create_gf(self, data=None):
+        """
+        Creates a graphframe from the config and builds a networkX graph from the hatchet graph.
+        Each graphframe is tagged by a unique identifier,
+        e.g., the runName from the config file or JSON.
+        """
+        if data:
+            self.gf = callflow.GraphFrame.from_data(data)
+        else:
+            gf = callflow.GraphFrame.from_config(self.props, self.tag)
+            self.gf = copy.deepcopy(gf)
+
+    def process_gf(self):
+        """
+        Process graphframe to add properties depending on the format.
+        Current processing is supported for hpctoolkit and caliper.
+ """ + gf = self._getter() + if self.props["format"][self.tag] == "hpctoolkit": + process = ( + Process.Builder(gf, self.tag) + .add_path() + .create_name_module_map() + .add_callers_and_callees() + .add_dataset_name() + .add_imbalance_perc() + .add_module_name_hpctoolkit() + .add_vis_node_name() + .build() + ) + elif self.props["format"][self.tag] == "caliper_json": + process = ( + Process.Builder(gf, self.tag) + .add_time_columns() + .add_rank_column() + .add_callers_and_callees() + .add_dataset_name() + .add_imbalance_perc() + .add_module_name_caliper(self.props["callsite_module_map"]) + .create_name_module_map() + .add_vis_node_name() + .add_path() + .build() + ) + + self._setter(process.gf) + + def group_gf(self, group_by="module"): + """ + Group the graphframe based on `group_by` parameter. + """ + gf = self._getter() + group = Group(gf, group_by) + + self._setter(group.gf) + + def filter_gf(self, mode="single"): + """ + Filter the graphframe. + """ + gf = self._getter() + filter_res = Filter( + gf=gf, + mode=mode, + filter_by=self.props["filter_by"], + filter_perc=self.props["filter_perc"], + ) + self._setter(filter_res.gf) + + def ensemble_gf(self, supergraphs): + + EnsembleGraph( + self.props, "ensemble", mode="process", supergraphs=single_supergraphs + ) + + def ensemble_auxiliary( + self, datasets, MPIBinCount=20, RunBinCount=20, process=True, write=True + ): + gf = self._getter() + EnsembleAuxiliary( + gf, + datasets=datasets, + props=self.props, + MPIBinCount=MPIBinCount, + RunBinCount=RunBinCount, + process=process, + write=write, + ) + + def single_auxiliary(self, dataset="", binCount=20, process=True): + gf = self._getter() + SingleAuxiliary( + gf, + dataset=dataset, + props=self.props, + MPIBinCount=binCount, + process=process, + ) + + # ------------------------------------------------------------------------------ + # Utilities. + + def get_top_n_callsites_by_attr(self, count, sort_attr): + """ + Returns an array of callsites (sorted by `sort_attr`) + """ + xgroup_df = self.entire_df.groupby(["name"]).mean() + sort_xgroup_df = xgroup_df.sort_values(by=[sort_attr], ascending=False) + callsites_df = sort_xgroup_df.nlargest(count, sort_attr) + return callsites_df.index.values.tolist() + + def read_gf(self, read_parameter=True, read_graph=False): + """ + # Read a single dataset stored in .callflow directory. 
+ """ + LOGGER.info("Reading the dataset: {0}".format(self.tag)) + + df_file_name = "df.csv" + df_file_path = os.path.join(self.dirname, self.tag, df_file_name) + df = pd.read_csv(df_file_path) + if df.empty: + raise ValueError(f"{df_file_path} is empty.") + + nxg_file_name = "nxg.json" + nxg_file_path = os.path.join(self.dirname, self.tag, nxg_file_name) + with open(nxg_file_path, "r") as nxg_file: + graph = json.load(nxg_file) + nxg = json_graph.node_link_graph(graph) + assert nxg != None + + graph = {} + if read_graph: + graph_file_name = "hatchet_tree.txt" + graph_file_path = os.path.join(self.dirname, self.tag, graph_file_name) + with open(graph_file_path, "r") as graph_file: + graph = json.load(graph_file) + assert isinstance(graph, ht.GraphFrame.Graph) + + parameters = {} + if read_parameter: + parameters_filepath = os.path.join(self.dirname, self.tag, "env_params.txt") + for line in open(parameters_filepath, "r"): + s = 0 + for num in line.strip().split(","): + split_num = num.split("=") + parameters[split_num[0]] = split_num[1] + + return {"df": df, "nxg": nxg, "graph": graph, "parameters": parameters} + + def write_gf(self, write_df=True, write_graph=False, write_nxg=True): + """ + # Write the dataset to .callflow directory. + """ + # Get the save path. + dirname = self.props["save_path"] + + gf = self.gf + # dump the filtered dataframe to csv if write_df is true. + if write_df: + df_file_name = "df.csv" + df_file_path = os.path.join(dirname, self.tag, df_file_name) + gf.df.to_csv(df_file_path) + + # TODO: Writing fails. + if write_nxg: + nxg_file_name = "nxg.json" + nxg_file_path = os.path.join(dirname, self.tag, nxg_file_name) + nxg_data = json_graph.node_link_data(self.gf.nxg) + with open(nxg_file_path, "w") as nxg_file: + json.dump(nxg_data, nxg_file) + + if write_graph: + graph_filepath = os.path.join(dirname, self.tag, "hatchet_tree.txt") + with open(graph_filepath, "a") as hatchet_graphFile: + hatchet_graphFile.write(self.gf.tree(color=False)) + + def write_similarity(self, datasets, states, type): + """ + # Write the pair-wise graph similarities into .callflow directory. + """ + ret = {} + for idx, dataset in enumerate(datasets): + ret[dataset] = [] + for idx_2, dataset2 in enumerate(datasets): + union_similarity = Similarity(states[dataset2].g, states[dataset].g) + ret[dataset].append(union_similarity.result) + + dirname = self.config.callflow_dir + name = self.config.runName + # similarity_filepath = dirname + "/" + "similarity.json" + similarity_filepath = os.path.join(dirname, "similarity.json") + with open(similarity_filepath, "w") as json_file: + json.dump(ret, json_file) + + def read_auxiliary_data(self): + """ + # Read the auxiliary data from all_data.json. + """ + all_data_filepath = os.path.join( + self.props["save_path"], self.tag, "auxiliary_data.json" + ) + LOGGER.info(f"[Read] {all_data_filepath}") + with open(all_data_filepath, "r") as filter_graphFile: + data = json.load(filter_graphFile) + return data + + # ------------------------------------------------------------------------------ + # NetworkX graph utility functions. + def create_target_maps(self, dataset): + # Reduce the entire_df to respective target dfs. + self.target_df[dataset] = self.gf.df.loc[self.gf.df["dataset"] == dataset] + + # Unique modules in the target run + self.target_modules[dataset] = self.target_df[dataset]["module"].unique() + + # Group the dataframe in two ways. + # 1. by module + # 2. 
by module and callsite + self.target_module_group_df[dataset] = self.target_df[dataset].groupby( + ["module"] + ) + self.target_module_name_group_df[dataset] = self.target_df[dataset].groupby( + ["module", "name"] + ) + + # Module map for target run {'module': [Array of callsites]} + self.target_module_callsite_map[dataset] = ( + self.target_module_group_df[dataset]["name"].unique().to_dict() + ) + + # Inclusive time maps for the module level and callsite level. + self.target_module_time_inc_map[dataset] = ( + self.target_module_group_df[dataset]["time (inc)"].max().to_dict() + ) + self.target_name_time_inc_map[dataset] = ( + self.target_module_name_group_df[dataset]["time (inc)"].max().to_dict() + ) + + # Exclusive time maps for the module level and callsite level. + self.target_module_time_exc_map[dataset] = ( + self.target_module_group_df[dataset]["time"].max().to_dict() + ) + self.target_name_time_exc_map[dataset] = ( + self.target_module_name_group_df[dataset]["time"].max().to_dict() + ) + def create_ensemble_maps(self): + self.modules = self.gf.df["module"].unique() -class SuperGraph(ht.GraphFrame): - def __init__(self, graph=None, dataframe=None, exc_metrics=None, inc_metrics=None): + self.module_name_group_df = self.gf.df.groupby(["module", "name"]) + self.module_group_df = self.gf.df.groupby(["module"]) + self.name_group_df = self.gf.df.groupby(["name"]) - # TODO: will we ever want to create a graphframe without data? - if graph is not None and dataframe is not None: - super().__init__(graph, dataframe, exc_metrics, inc_metrics) + # Module map for ensemble {'module': [Array of callsites]} + self.module_callsite_map = self.module_group_df["name"].unique().to_dict() - # shortcut! - self.df = self.dataframe + # Inclusive time maps for the module level and callsite level. + self.module_time_inc_map = self.module_group_df["time (inc)"].max().to_dict() + self.name_time_inc_map = self.module_name_group_df["time (inc)"].max().to_dict() - # save a networkx graph - self.nxg = None + # Exclusive time maps for the module level and callsite level. 
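The exclusive-time maps just below follow the same pattern as the inclusive ones above. A rough illustration of what these groupby maps hold, using an invented three-row frame with the usual CallFlow columns:

import pandas as pd

df = pd.DataFrame({
    "module":     ["MPI", "MPI", "lulesh"],
    "name":       ["MPI_Send", "MPI_Recv", "CalcForce"],
    "time (inc)": [4.0, 3.0, 9.0],
    "time":       [1.5, 1.0, 6.0],
})

# Module-level and callsite-level maps, keyed the same way as above.
print(df.groupby(["module"])["time (inc)"].max().to_dict())
# {'MPI': 4.0, 'lulesh': 9.0}
print(df.groupby(["module", "name"])["time"].max().to_dict())
# {('MPI', 'MPI_Recv'): 1.0, ('MPI', 'MPI_Send'): 1.5, ('lulesh', 'CalcForce'): 6.0}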
+ self.module_time_exc_map = self.module_group_df["time"].max().to_dict() + self.name_time_exc_map = self.module_name_group_df["time"].max().to_dict() + + def remove_cycles_in_paths(self, path): + ret = [] + moduleMapper = {} + dataMap = {} + + if isinstance(path, float): + return [] + path = make_list(path) + for idx, elem in enumerate(path): + callsite = elem.split("=")[1] + module = elem.split("=")[0] + if module not in dataMap: + moduleMapper[module] = 0 + dataMap[module] = [ + {"callsite": callsite, "module": module, "level": idx} + ] + else: + flag = [p["level"] == idx for p in dataMap[module]] + if np.any(np.array(flag)): + moduleMapper[module] += 1 + dataMap[module].append( + { + "callsite": callsite, + "module": module + "=" + callsite, + "level": idx, + } + ) + else: + dataMap[module].append( + {"callsite": callsite, "module": module, "level": idx} + ) + ret.append(dataMap[module][-1]) + + return ret + + def print_information(self): + LOGGER.info("Modules: {0}".format(self.supergraph.gf.df["module"].unique())) + LOGGER.info("Top 10 Inclusive time: ") + top = 10 + rank_df = self.supergraph.gf.df.groupby(["name", "nid"]).mean() + top_inclusive_df = rank_df.nlargest(top, "time (inc)", keep="first") + for name, row in top_inclusive_df.iterrows(): + LOGGER.info("{0} [{1}]".format(name, row["time (inc)"])) + + LOGGER.info("Top 10 Enclusive time: ") + top_exclusive_df = rank_df.nlargest(top, "time", keep="first") + for name, row in top_exclusive_df.iterrows(): + LOGGER.info("{0} [{1}]".format(name, row["time"])) + + for node in self.supergraph.gf.nxg.nodes(data=True): + LOGGER.info("Node: {0}".format(node)) + for edge in self.supergraph.gf.nxg.edges(): + LOGGER.info("Edge: {0}".format(edge)) + + LOGGER.info("Nodes in the tree: {0}".format(len(self.supergraph.gf.nxg.nodes))) + LOGGER.info("Edges in the tree: {0}".format(len(self.supergraph.gf.nxg.edges))) + LOGGER.info("Is it a tree? : {0}".format(nx.is_tree(self.supergraph.gf.nxg))) + LOGGER.info( + "Flow hierarchy: {0}".format(nx.flow_hierarchy(self.supergraph.gf.nxg)) + ) + + # ------------------------------------------------------------------------------ + # Module hierarchy. + # TODO: we might have to delete the module hierarchy file in modules later. + # TODO: This might fail. @staticmethod - def _create_source_targets(self, path): + def _create_source_targets(path): module = "" edges = [] @@ -30,7 +427,7 @@ def _create_source_targets(self, path): return edges @staticmethod - def _check_cycles(self, hierarchy, G): + def _check_cycles(hierarchy, G): try: cycles = list(nx.find_cycle(self.hierarchy, orientation="ignore")) except: @@ -39,7 +436,7 @@ def _check_cycles(self, hierarchy, G): return cycles @staticmethod - def _remove_cycles(self, hierarchy, G, cycles): + def _remove_cycles(hierarchy, G, cycles): for cycle in cycles: source = cycle[0] target = cycle[1] @@ -93,3 +490,239 @@ def module_hierarchy(self, module=None): print(f"cycles: {cycles}") return hierarchy + + # ------------------------------------------------------------------------------ + # Add paths according to what input is provided. + # Should be implemented by the child classes. + def add_paths(self, path): + pass + + def add_node_attributes(self): + pass + + def add_edge_attribtues(self): + pass + + # ------------------------------------------------------------------------------ + # Reveal a callsite's path + # TODO: not tested. Could break. 
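Before the helper itself, a compact self-contained sketch (module and callsite names are invented) of the edge chain that create_source_targets below builds from a component path, where the first entry is the module and the remaining entries are its callsites:

component_path = ["libphysics", "entry_fn", "kernel_a", "kernel_b"]
module = component_path[0]

edges = [{"source": module, "target": f"{module}={component_path[1]}"}]
for idx in range(1, len(component_path) - 1):
    edges.append({
        "source": f"{module}={component_path[idx]}",
        "target": f"{module}={component_path[idx + 1]}",
    })

for e in edges:
    print(e["source"], "->", e["target"])
# libphysics -> libphysics=entry_fn
# libphysics=entry_fn -> libphysics=kernel_a
# libphysics=kernel_a -> libphysics=kernel_b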
+ def create_source_targets(self, component_path): + module = "" + edges = [] + for idx, callsite in enumerate(component_path): + if idx == 0: + module = component_path[0] + edges.append( + { + "module": module, + "source": module, + "target": module + "=" + component_path[idx + 1], + } + ) + pass + elif idx == len(component_path) - 1: + pass + else: + edges.append( + { + "module": module, + "source": module + "=" + component_path[idx], + "target": module + "=" + component_path[idx + 1], + } + ) + + return edges + + def callsite_paths(self, callsites): + paths = [] + for callsite in callsites: + df = self.name_group_df.get_group(callsite) + paths.append( + { + "group_path": make_list(df["group_path"].unique()[0]), + "path": make_list(df["path"].unique()[0]), + "component_path": make_list(df["component_path"].unique()[0]), + } + ) + return paths + + def add_reveal_paths(self, reveal_callsites): + paths = self.callsite_paths(reveal_callsites) + + for path in paths: + component_edges = self.create_source_targets(path["component_path"]) + for idx, edge in enumerate(component_edges): + module = edge["module"] + + # format module + '=' + callsite + source = edge["source"] + target = edge["target"] + + if not self.supergraph.gf.nxg.has_edge(source, target): + if idx == 0: + source_callsite = source + source_df = self.module_group_df.get_group((module)) + source_node_type = "super-node" + else: + source_callsite = source.split("=")[1] + source_df = self.module_name_group_df.get_group( + (module, source_callsite) + ) + source_node_type = "component-node" + + target_callsite = target.split("=")[1] + target_df = self.module_name_group_df.get_group( + (module, target_callsite) + ) + target_node_type = "component-node" + + source_weight = source_df["time (inc)"].max() + target_weight = target_df["time (inc)"].max() + + edge_type = "normal" + + print(f"Adding edge: {source_callsite}, {target_callsite}") + self.supergraph.gf.nxg.add_node( + source, attr_dict={"type": source_node_type} + ) + self.supergraph.gf.nxg.add_node( + target, attr_dict={"type": target_node_type} + ) + self.supergraph.gf.nxg.add_edge( + source, + target, + attr_dict=[ + { + "source_callsite": source_callsite, + "target_callsite": target_callsite, + "edge_type": edge_type, + "weight": target_weight, + "edge_type": "reveal_edge", + } + ], + ) + + def add_exit_callsite(): + # TODO: This code is missing for some reason. + pass + + # ------------------------------------------------------------------------------ + # Create a module hierarchy for a chosen module. + # Not fully tested. Might break. 
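A toy picture (modules and callsites invented) of the entry-function map that module_entry_functions_map below derives from the super graph's edge attributes: every edge contributes its target_callsite to the list kept for the edge's target module.

import networkx as nx

g = nx.DiGraph()
g.add_edge("libA", "libB", attr_dict=[{"target_callsite": "foo"}])
g.add_edge("libC", "libB", attr_dict=[{"target_callsite": "bar"}])

entry_functions = {}
for source, target, data in g.edges(data=True):
    for edge_attr in data["attr_dict"]:
        entry_functions.setdefault(target, []).append(edge_attr["target_callsite"])

print(entry_functions)  # {'libB': ['foo', 'bar']}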
+ def module_entry_functions_map(self, graph): + entry_functions = {} + for edge in graph.edges(data=True): + attr_dict = edge[2]["attr_dict"] + edge_tuple = (edge[0], edge[1]) + for edge_attr in attr_dict: + if edge_tuple[1] not in entry_functions: + entry_functions[edge_tuple[1]] = [] + entry_functions[edge_tuple[1]].append(edge_attr["target_callsite"]) + return entry_functions + + def create_source_targets_from_group_path(self, path): + module = "" + edges = [] + for idx, callsite in enumerate(path): + if idx == len(path) - 1: + break + source = path[idx].split("=") + target = path[idx + 1].split("=") + edges.append( + { + "source": source[0], + "target": target[0], + "source_callsite": source[1], + "target_callsite": target[1], + } + ) + return edges + + def same_source_edges(self, component_edges, reveal_module): + ret = [] + for idx, edge in enumerate(component_edges): + source = edge["source"] + target = edge["target"] + + if source == reveal_module: + ret.append(edge) + return ret + + def same_target_edges(self, component_edges, reveal_module): + ret = [] + for idx, edge in enumerate(component_edges): + source = edge["source"] + target = edge["target"] + + if target == reveal_module: + ret.append(edge) + return ret + + def add_entry_callsite(self, reveal_module): + entry_functions_map = self.module_entry_functions_map(self.supergraph.gf.nxg) + reveal_callsites = entry_functions_map[reveal_module] + paths = self.callsitePathInformation(reveal_callsites) + + for path in paths: + component_edges = self.create_source_targets_from_group_path( + path["group_path"] + ) + source_edges_to_remove = self.same_source_edges( + component_edges, reveal_module + ) + target_edges_to_remove = self.same_target_edges( + component_edges, reveal_module + ) + + if len(source_edges_to_remove) != 0: + for edge in source_edges_to_remove: + if self.supergraph.gf.nxg.has_edge(edge["source"], edge["target"]): + self.supergraph.gf.nxg.remove_edge( + (edge["source"], edge["target"]) + ) + self.supergraph.gf.nxg.add_node( + reveal_module + "=" + edge["source_callsite"], + attr_dict={"type": "component-node"}, + ) + self.supergraph.gf.nxg.add_edge( + (reveal_module + "=" + edge["source_callsite"], edge["target"]), + attr_dict=[ + { + "source_callsite": edge["source_callsite"], + "target_callsite": edge["target_callsite"], + "edge_type": "normal", + "weight": self.module_name_group_df.get_group( + (reveal_module, edge["source_callsite"]) + )["time (inc)"].max(), + "edge_type": "reveal_edge", + } + ], + ) + + if len(target_edges_to_remove) != 0: + for edge in target_edges_to_remove: + if self.supergraph.gf.nxg.has_edge(edge["source"], edge["target"]): + self.supergraph.gf.nxg.remove_edge( + edge["source"], edge["target"] + ) + self.supergraph.gf.nxg.add_node( + reveal_module + "=" + edge["target_callsite"], + attr_dict={"type": "component-node"}, + ) + self.supergraph.gf.nxg.add_edge( + edge["source"], + reveal_module + "=" + edge["target_callsite"], + attr_dict=[ + { + "source_callsite": edge["source_callsite"], + "target_callsite": edge["target_callsite"], + "edge_type": "normal", + "weight": self.module_name_group_df.get_group( + (edge["target"], edge["target_callsite"]) + )["time (inc)"].max(), + "edge_type": "reveal_edge", + } + ], + ) + + self.supergraph.gf.nxg.remove_node(reveal_module) diff --git a/callflow/datastructures/supergraph_ensemble.py b/callflow/datastructures/supergraph_ensemble.py index 116c6f50..d54aaa6e 100644 --- a/callflow/datastructures/supergraph_ensemble.py +++ 
b/callflow/datastructures/supergraph_ensemble.py @@ -1,52 +1,53 @@ +# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# CallFlow Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +# ------------------------------------------------------------------------------ +# Library imports import networkx as nx import numpy as np import pandas as pd import math, json from ast import literal_eval as make_list +# ------------------------------------------------------------------------------ +# CallFlow imports import callflow LOGGER = callflow.get_logger(__name__) -from callflow.timer import Timer - +from callflow import SuperGraph -class EnsembleSuperGraph(nx.Graph): - # Attributes: - # 1. State => Pass the state which needs to be handled. - # 2. path => '', 'path', 'group_path' or 'component_path' - # 3. construct_graph -> To decide if we should construct graph from path - # 4. add_data => To +# ------------------------------------------------------------------------------ +# Ensemble Super Graph class. +class EnsembleSuperGraph(SuperGraph): def __init__( self, - states, - path, + supergraphs={}, + tag="", + path="path", group_by_attr="module", + props={}, construct_graph=True, add_data=False, reveal_callsites=[], split_entry_module="", split_callee_module="", ): - super(EnsembleSuperGraph, self).__init__() - self.states = states - self.timer = Timer() - - # Store the ensemble graph (Since it is already processed.) - self.state_entire = self.states["ensemble_entire"] - self.state_filter = self.states["ensemble_filter"] - self.state_group = self.states["ensemble_group"] - self.ensemble_g = self.state_group.new_gf.nxg - self.node_list = np.array(list(self.ensemble_g.nodes())) - - # Path type to group by - # TODO: Generalize to any group the user provides. + # Call the SuperGraph class init. + super(EnsembleSuperGraph, self).__init__(props=props, tag=tag, mode="render") + + # Stores all the SuperGraphs using a Map. + self.supergraphs = supergraphs + self.path = path self.group_by = group_by_attr - self.entire_df = self.state_entire.new_gf.df - self.group_df = self.state_group.new_gf.df + # Need to remove. + self.ensemble_supergraph = self.supergraphs["ensemble"] + self.group_df = self.ensemble_supergraph.gf.df + # Columns to consider. - # TODO: Generalize it either all columns or let user specify the value using config.json self.columns = [ "time (inc)", "module", @@ -57,13 +58,7 @@ def __init__( "actual_time", ] - # Store all the names of runs in self.runs. 
- # TODO: Change name in the df from 'dataset' to 'run' - self.runs = self.entire_df["dataset"].unique() - - with self.timer.phase("Creating data maps"): - self.create_ensemble_maps() - self.create_target_maps() + self.runs = self.group_df["dataset"].unique() self.reveal_callsites = reveal_callsites self.split_entry_module = split_entry_module @@ -72,11 +67,11 @@ def __init__( with self.timer.phase("Construct Graph"): if construct_graph: LOGGER.info( - "Creating a SuperGraph for {0}.".format(self.state_group.name) + "Creating a SuperGraph for {0}.".format(self.supergraphs.keys()) ) self.cct = nx.DiGraph() - self.agg_g = nx.DiGraph() + self.agg_nxg = nx.DiGraph() self.add_paths(path) self.add_reveal_paths(self.reveal_callsites) if self.split_entry_module != "": @@ -84,331 +79,18 @@ def __init__( if self.split_callee_module != "": self.add_exit_callees_paths() else: - print("Using the existing graph from state {0}".format(self.state.name)) + LOGGER.debug(f"Using the existing graph from state {self.state.name}") - add_data = True with self.timer.phase("Add graph attributes"): - if add_data == True: - self.add_node_attributes() - self.add_edge_attributes() - # else: - # print("Creating a Graph without node or edge attributes.") + self.add_node_attributes() + self.add_edge_attributes() print(self.timer) - def create_target_maps(self): - self.target_df = {} - self.target_modules = {} - self.target_module_group_df = {} - self.target_module_name_group_df = {} - self.target_module_callsite_map = {} - self.target_module_time_inc_map = {} - self.target_module_time_exc_map = {} - self.target_name_time_inc_map = {} - self.target_name_time_exc_map = {} - - for run in self.runs: - # Reduce the entire_df to respective target dfs. - self.target_df[run] = self.entire_df.loc[self.entire_df["dataset"] == run] - - # Unique modules in the target run - self.target_modules[run] = self.target_df[run]["module"].unique() - - # Group the dataframe in two ways. - # 1. by module - # 2. by module and callsite - self.target_module_group_df[run] = self.target_df[run].groupby(["module"]) - self.target_module_name_group_df[run] = self.target_df[run].groupby( - ["module", "name"] - ) - - # Module map for target run {'module': [Array of callsites]} - self.target_module_callsite_map[run] = ( - self.target_module_group_df[run]["name"].unique().to_dict() - ) - - # Inclusive time maps for the module level and callsite level. - self.target_module_time_inc_map[run] = ( - self.target_module_group_df[run]["time (inc)"].max().to_dict() - ) - self.target_name_time_inc_map[run] = ( - self.target_module_name_group_df[run]["time (inc)"].max().to_dict() - ) - - # Exclusive time maps for the module level and callsite level. - self.target_module_time_exc_map[run] = ( - self.target_module_group_df[run]["time"].max().to_dict() - ) - self.target_name_time_exc_map[run] = ( - self.target_module_name_group_df[run]["time"].max().to_dict() - ) - - def create_ensemble_maps(self): - self.modules = self.entire_df["module"].unique() - - self.module_name_group_df = self.entire_df.groupby(["module", "name"]) - self.module_group_df = self.entire_df.groupby(["module"]) - self.name_group_df = self.entire_df.groupby(["name"]) - - # Module map for ensemble {'module': [Array of callsites]} - self.module_callsite_map = self.module_group_df["name"].unique().to_dict() - - # Inclusive time maps for the module level and callsite level. 
- self.module_time_inc_map = self.module_group_df["time (inc)"].max().to_dict() - self.name_time_inc_map = self.module_name_group_df["time (inc)"].max().to_dict() - - # Exclusive time maps for the module level and callsite level. - self.module_time_exc_map = self.module_group_df["time"].max().to_dict() - self.name_time_exc_map = self.module_name_group_df["time"].max().to_dict() - - def construct_cycle_free_paths(self, path): - ret = [] - moduleMapper = {} - dataMap = {} - - if isinstance(path, float): - return [] - path = make_list(path) - for idx, elem in enumerate(path): - callsite = elem.split("=")[1] - module = elem.split("=")[0] - if module not in dataMap: - moduleMapper[module] = 0 - dataMap[module] = [ - {"callsite": callsite, "module": module, "level": idx} - ] - else: - flag = [p["level"] == idx for p in dataMap[module]] - if np.any(np.array(flag)): - moduleMapper[module] += 1 - dataMap[module].append( - { - "callsite": callsite, - "module": module + "=" + callsite, - "level": idx, - } - ) - else: - dataMap[module].append( - {"callsite": callsite, "module": module, "level": idx} - ) - ret.append(dataMap[module][-1]) - - return ret - - def create_source_targets(self, component_path): - module = "" - edges = [] - for idx, callsite in enumerate(component_path): - if idx == 0: - module = component_path[0] - edges.append( - { - "module": module, - "source": module, - "target": module + "=" + component_path[idx + 1], - } - ) - pass - elif idx == len(component_path) - 1: - pass - else: - edges.append( - { - "module": module, - "source": module + "=" + component_path[idx], - "target": module + "=" + component_path[idx + 1], - } - ) - - return edges - - def callsitePathInformation(self, callsites): - paths = [] - for callsite in callsites: - df = self.name_group_df.get_group(callsite) - paths.append( - { - "group_path": make_list(df["group_path"].unique()[0]), - "path": make_list(df["path"].unique()[0]), - "component_path": make_list(df["component_path"].unique()[0]), - } - ) - return paths - - def add_reveal_paths(self, reveal_callsites): - paths = self.callsitePathInformation(reveal_callsites) - - for path in paths: - component_edges = self.create_source_targets(path["component_path"]) - for idx, edge in enumerate(component_edges): - module = edge["module"] - - # format module + '=' + callsite - source = edge["source"] - target = edge["target"] - - if not self.agg_g.has_edge(source, target): - if idx == 0: - source_callsite = source - source_df = self.module_group_df.get_group((module)) - source_node_type = "super-node" - else: - source_callsite = source.split("=")[1] - source_df = self.module_name_group_df.get_group( - (module, source_callsite) - ) - source_node_type = "component-node" - - target_callsite = target.split("=")[1] - target_df = self.module_name_group_df.get_group( - (module, target_callsite) - ) - target_node_type = "component-node" - - source_weight = source_df["time (inc)"].max() - target_weight = target_df["time (inc)"].max() - - edge_type = "normal" - - print(f"Adding edge: {source_callsite}, {target_callsite}") - self.agg_g.add_node(source, attr_dict={"type": source_node_type}) - self.agg_g.add_node(target, attr_dict={"type": target_node_type}) - self.agg_g.add_edge( - source, - target, - attr_dict=[ - { - "source_callsite": source_callsite, - "target_callsite": target_callsite, - "edge_type": edge_type, - "weight": target_weight, - "edge_type": "reveal_edge", - } - ], - ) - - ######################### Entry function interaction ################################ - - 
def module_entry_functions_map(self, graph): - entry_functions = {} - for edge in graph.edges(data=True): - attr_dict = edge[2]["attr_dict"] - edge_tuple = (edge[0], edge[1]) - print(edge_tuple) - for edge_attr in attr_dict: - if edge_tuple[1] not in entry_functions: - entry_functions[edge_tuple[1]] = [] - entry_functions[edge_tuple[1]].append(edge_attr["target_callsite"]) - return entry_functions - - def create_source_targets_from_group_path(self, path): - module = "" - edges = [] - for idx, callsite in enumerate(path): - if idx == len(path) - 1: - break - source = path[idx].split("=") - target = path[idx + 1].split("=") - edges.append( - { - "source": source[0], - "target": target[0], - "source_callsite": source[1], - "target_callsite": target[1], - } - ) - return edges - - def same_source_edges(self, component_edges, reveal_module): - ret = [] - for idx, edge in enumerate(component_edges): - source = edge["source"] - target = edge["target"] - - if source == reveal_module: - ret.append(edge) - return ret - - def same_target_edges(self, component_edges, reveal_module): - ret = [] - for idx, edge in enumerate(component_edges): - source = edge["source"] - target = edge["target"] - - if target == reveal_module: - ret.append(edge) - return ret - - def add_entry_callsite_paths(self, reveal_module): - entry_functions_map = self.module_entry_functions_map(self.agg_g) - reveal_callsites = entry_functions_map[reveal_module] - paths = self.callsitePathInformation(reveal_callsites) - - for path in paths: - component_edges = self.create_source_targets_from_group_path( - path["group_path"] - ) - source_edges_to_remove = self.same_source_edges( - component_edges, reveal_module - ) - target_edges_to_remove = self.same_target_edges( - component_edges, reveal_module - ) - - if len(source_edges_to_remove) != 0: - for edge in source_edges_to_remove: - if self.agg_g.has_edge(edge["source"], edge["target"]): - self.agg_g.remove_edge((edge["source"], edge["target"])) - self.agg_g.add_node( - reveal_module + "=" + edge["source_callsite"], - attr_dict={"type": "component-node"}, - ) - self.agg_g.add_edge( - (reveal_module + "=" + edge["source_callsite"], edge["target"]), - attr_dict=[ - { - "source_callsite": edge["source_callsite"], - "target_callsite": edge["target_callsite"], - "edge_type": "normal", - "weight": self.module_name_group_df.get_group( - (reveal_module, edge["source_callsite"]) - )["time (inc)"].max(), - "edge_type": "reveal_edge", - } - ], - ) - - if len(target_edges_to_remove) != 0: - for edge in target_edges_to_remove: - if self.agg_g.has_edge(edge["source"], edge["target"]): - self.agg_g.remove_edge(edge["source"], edge["target"]) - self.agg_g.add_node( - reveal_module + "=" + edge["target_callsite"], - attr_dict={"type": "component-node"}, - ) - self.agg_g.add_edge( - edge["source"], - reveal_module + "=" + edge["target_callsite"], - attr_dict=[ - { - "source_callsite": edge["source_callsite"], - "target_callsite": edge["target_callsite"], - "edge_type": "normal", - "weight": self.module_name_group_df.get_group( - (edge["target"], edge["target_callsite"]) - )["time (inc)"].max(), - "edge_type": "reveal_edge", - } - ], - ) - - self.agg_g.remove_node(reveal_module) - def add_paths(self, path): paths_df = self.group_df.groupby(["name", "group_path"]) for (callsite, path_str), path_df in paths_df: - path_list = self.construct_cycle_free_paths(path_str) + path_list = self.remove_cycles_in_paths(path_str) for callsite_idx, callsite in enumerate(path_list): if callsite_idx != len(path_list) - 1: 
source = path_list[callsite_idx] @@ -427,8 +109,10 @@ def add_paths(self, path): (target_module, target_callsite) ) - has_caller_edge = self.agg_g.has_edge(source_module, target_module) - has_callback_edge = self.agg_g.has_edge( + has_caller_edge = self.agg_nxg.has_edge( + source_module, target_module + ) + has_callback_edge = self.agg_nxg.has_edge( target_module, source_module ) has_cct_edge = self.cct.has_edge(source_callsite, target_callsite) @@ -468,21 +152,21 @@ def add_paths(self, path): print( f"Add {edge_type} edge for : {source_module}--{target_module}" ) - self.agg_g.add_node(source_module, attr_dict=node_dict) - self.agg_g.add_node(target_module, attr_dict=node_dict) - self.agg_g.add_edge( + self.agg_nxg.add_node(source_module, attr_dict=node_dict) + self.agg_nxg.add_node(target_module, attr_dict=node_dict) + self.agg_nxg.add_edge( source_module, target_module, attr_dict=[edge_dict] ) elif not has_cct_edge and not has_callback_edge: # print(f"Edge already exists for : {source_module}--{target_module}") - edge_data = self.agg_g.get_edge_data( + edge_data = self.agg_nxg.get_edge_data( *(source_module, target_module) ) - self.agg_g[source_module][target_module]["attr_dict"].append( + self.agg_nxg[source_module][target_module]["attr_dict"].append( edge_dict ) - # print(agg_g[source_module][target_module]) + # print(agg_nxg[source_module][target_module]) if not has_cct_edge: self.cct.add_edge( @@ -492,21 +176,23 @@ def add_paths(self, path): ) def add_edge_attributes(self): - # runs_mapping = self.run_counts(self.agg_g) - # nx.set_edge_attributes(self.agg_g, name="number_of_runs", values=runs_mapping) - edge_type_mapping = self.edge_type(self.agg_g) - nx.set_edge_attributes(self.agg_g, name="edge_type", values=edge_type_mapping) - flow_mapping = self.flows(self.agg_g) - nx.set_edge_attributes(self.agg_g, name="weight", values=flow_mapping) - # target_flow_mapping = self.target_flows(self.agg_g) - # nx.set_edge_attributes(self.agg_g, name="target_weight", values=target_flow_mapping) - entry_functions_mapping = self.entry_functions(self.agg_g) + # runs_mapping = self.run_counts(self.agg_nxg) + # nx.set_edge_attributes(self.agg_nxg, name="number_of_runs", values=runs_mapping) + + edge_type_mapping = self.edge_type(self.agg_nxg) + nx.set_edge_attributes(self.agg_nxg, name="edge_type", values=edge_type_mapping) + + flow_mapping = self.flows(self.agg_nxg) + nx.set_edge_attributes(self.agg_nxg, name="weight", values=flow_mapping) + + entry_functions_mapping = self.entry_functions(self.agg_nxg) nx.set_edge_attributes( - self.agg_g, name="entry_callsites", values=entry_functions_mapping + self.agg_nxg, name="entry_callsites", values=entry_functions_mapping ) - exit_functions_mapping = self.exit_functions(self.agg_g) + + exit_functions_mapping = self.exit_functions(self.agg_nxg) nx.set_edge_attributes( - self.agg_g, name="exit_callsites", values=exit_functions_mapping + self.agg_nxg, name="exit_callsites", values=exit_functions_mapping ) def run_counts(self, graph): @@ -523,7 +209,7 @@ def edge_type(self, graph): def flows(self, graph): self.weight_map = {} - for edge in self.agg_g.edges(data=True): + for edge in self.agg_nxg.edges(data=True): if (edge[0], edge[1]) not in self.weight_map: self.weight_map[(edge[0], edge[1])] = 0 @@ -537,7 +223,6 @@ def flows(self, graph): if edge_tuple not in self.weight_map: # Check if it s a reveal edge attr_dict = edge[2]["attr_dict"] - print(attr_dict) if attr_dict["edge_type"] == "reveal_edge": self.weight_map[edge_tuple] = attr_dict["weight"] 
ret[edge_tuple] = self.weight_map[edge_tuple] @@ -548,9 +233,10 @@ def flows(self, graph): return ret + # Not used. def target_flows(self, graph): self.weight_map = {} - for edge in self.agg_g.edges(data=True): + for edge in self.agg_nxg.edges(data=True): if (edge[0], edge[1]) not in self.weight_map: self.weight_map[(edge[0], edge[1])] = 0 @@ -564,7 +250,6 @@ def target_flows(self, graph): if edge_tuple not in self.weight_map: # Check if it s a reveal edge attr_dict = edge[2]["attr_dict"] - print(attr_dict) if attr_dict["edge_type"] == "reveal_edge": self.weight_map[edge_tuple] = attr_dict["weight"] ret[edge_tuple] = self.weight_map[edge_tuple] @@ -608,16 +293,16 @@ def exit_functions(self, graph): return exit_functions def add_node_attributes(self): - ensemble_mapping = self.ensemble_map(self.agg_g.nodes()) + ensemble_mapping = self.ensemble_map(self.agg_nxg.nodes()) for idx, key in enumerate(ensemble_mapping): - nx.set_node_attributes(self.agg_g, name=key, values=ensemble_mapping[key]) + nx.set_node_attributes(self.agg_nxg, name=key, values=ensemble_mapping[key]) dataset_mapping = {} for run in self.runs: - dataset_mapping[run] = self.dataset_map(self.agg_g.nodes(), run) + dataset_mapping[run] = self.dataset_map(self.agg_nxg.nodes(), run) - nx.set_node_attributes(self.agg_g, name=run, values=dataset_mapping[run]) + nx.set_node_attributes(self.agg_nxg, name=run, values=dataset_mapping[run]) def callsite_time(self, group_df, module, callsite): callsite_df = group_df.get_group((module, callsite)) @@ -642,13 +327,11 @@ def ensemble_map(self, nodes): ret = {} # loop through the nodes - for node in self.agg_g.nodes(data=True): + for node in self.agg_nxg.nodes(data=True): node_name = node[0] node_dict = node[1]["attr_dict"] - print(node_name, node_dict) if node_dict["type"] == "component-node": - print(node_name, node_dict) module = node_name.split("=")[0] callsite = node_name.split("=")[1] actual_time = self.callsite_time( @@ -696,7 +379,7 @@ def ensemble_map(self, nodes): def dataset_map(self, nodes, run): ret = {} - for node in self.agg_g.nodes(data=True): + for node in self.agg_nxg.nodes(data=True): node_name = node[0] node_dict = node[1]["attr_dict"] if node_name in self.target_module_callsite_map[run].keys(): diff --git a/callflow/datastructures/supergraph_single.py b/callflow/datastructures/supergraph_single.py index 06756e1f..0c11f1c9 100644 --- a/callflow/datastructures/supergraph_single.py +++ b/callflow/datastructures/supergraph_single.py @@ -1,92 +1,67 @@ -############################################################################## -# Copyright (c) 2018-2019, Lawrence Livermore National Security, LLC. -# Produced at the Lawrence Livermore National Laboratory. +# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# CallFlow Project Developers. See the top-level LICENSE file for details. # -# This file is part of Callflow. -# Created by Suraj Kesavan . -# LLNL-CODE-741008. All rights reserved. -# -# For details, see: https://github.com/LLNL/Callflow -# Please also read the LICENSE file for the MIT License notice. 
-############################################################################## +# SPDX-License-Identifier: MIT +# ------------------------------------------------------------------------------ +# Library imports import sys import networkx as nx import math import json from ast import literal_eval as make_tuple + +# ------------------------------------------------------------------------------ +# CallFlow imports +import callflow from callflow.timer import Timer +from callflow import SuperGraph +LOGGER = callflow.get_logger(__name__) -class SingleSuperGraph(nx.Graph): +# ------------------------------------------------------------------------------ +# Single Super Graph class. +class SingleSuperGraph(SuperGraph): def __init__( self, - states, + supergraphs, + tag, dataset, path, group_by_attr="module", construct_graph=True, add_data=True, - debug=True, ): - super(SingleSuperGraph, self).__init__() - self.log = Log("supergraph") - self.state = states[dataset] - self.dataset = dataset - self.timer = Timer() + super(SingleSuperGraph, self).__init__(props=props, tag=tag, mode="render") - self.graph = state.new_gf.graph - self.df = state.new_gf.df - self.g = state.new_gf.nxg + self.ensemble_supergraph = self.supergraphs[tag] + self.group_df = self.ensemble_supergraph.gf.df + self.path = path self.group_by = group_by_attr + # Columns to consider. self.columns = [ "time (inc)", - "group_path", + "module", "name", "time", - "callers", - "callees", - "vis_name", + "type", "module", - "show_node", + "actual_time", ] with self.timer.phase("Construct Graph"): if construct_graph: - log.info("Creating the SuperGraph for {0}.".format(self.state.name)) + LOGGER.info("Creating the SuperGraph for {0}.".format(self.state.name)) self.mapper = {} self.g = nx.DiGraph() self.add_paths(path) + self.add_callback_paths() else: print("Using the existing graph from state {0}".format(self.state.name)) - if debug: - log.warn("Modules: {0}".format(self.df["module"].unique())) - log.warn("Top 10 Inclusive time: ") - top = 10 - rank_df = self.df.groupby(["name", "nid"]).mean() - top_inclusive_df = rank_df.nlargest(top, "time (inc)", keep="first") - for name, row in top_inclusive_df.iterrows(): - log.info("{0} [{1}]".format(name, row["time (inc)"])) - - log.warn("Top 10 Enclusive time: ") - top_exclusive_df = rank_df.nlargest(top, "time", keep="first") - for name, row in top_exclusive_df.iterrows(): - log.info("{0} [{1}]".format(name, row["time"])) - - for node in self.g.nodes(data=True): - log.info("Node: {0}".format(node)) - for edge in self.g.edges(): - log.info("Edge: {0}".format(edge)) - - log.warn("Nodes in the tree: {0}".format(len(self.g.nodes))) - log.warn("Edges in the tree: {0}".format(len(self.g.edges))) - log.warn("Is it a tree? : {0}".format(nx.is_tree(self.g))) - log.warn("Flow hierarchy: {0}".format(nx.flow_hierarchy(self.g))) - - # Variables to control the data properties globally. + # Remove. 
self.callbacks = [] self.edge_direction = {} @@ -94,30 +69,12 @@ def __init__( if add_data == True: self.add_node_attributes() self.add_edge_attributes() - # else: - # print("Creating a Graph without node or edge attributes.") - - log.info(self.timer) - - def no_cycle_path(self, path): - ret = [] - moduleMapper = {} - for idx, elem in enumerate(path): - call_site = elem.split("=")[1] - module = self.df.loc[self.df.name == call_site]["module"].tolist()[0] - if module not in moduleMapper and elem in self.mapper: - self.mapper[elem] += 1 - moduleMapper[module] = True - ret.append(elem) - elif elem not in self.mapper: - self.mapper[elem] = 0 else: - self.mapper[elem] += 1 - return tuple(ret) + LOGGER.info("Creating a Graph without node or edge attributes.") + + LOGGER.debug(self.timer) def add_paths(self, path): - # path_df = self.df[path].fillna("()") - # paths = path_df.drop_duplicates().tolist() paths = self.df[path].unique() for idx, path_str in enumerate(paths): if not isinstance(path_str, float): @@ -138,6 +95,7 @@ def add_paths(self, path): }, ) + # TODO: remove this if not needed. def add_callback_paths(self): for from_module, to_modules in self.callbacks.items(): for idx, to_module in enumerate(to_modules): @@ -196,23 +154,6 @@ def calculate_flows(self, graph): return ret - def tailhead(self, edge): - return (edge[0], edge[1]) - - def tailheadDir(self, edge): - return (str(edge[0]), str(edge[1]), self.edge_direction[edge]) - - def leaves_below(self, graph, node): - return set( - sum( - ( - [vv for vv in v if graph.out_degree(vv) == 0] - for k, v in nx.dfs_successors(graph, node).items() - ), - [], - ) - ) - def dataset_map(self, nodes, dataset): ret = {} for node in self.g.nodes(): diff --git a/callflow/datastructures/uniongraph.py b/callflow/datastructures/uniongraph.py deleted file mode 100644 index 426f4c90..00000000 --- a/callflow/datastructures/uniongraph.py +++ /dev/null @@ -1,76 +0,0 @@ -import networkx as nx - - -class UnionGraph: - def __init__(self): - self.R = nx.DiGraph() - self.runs = {} - self.diffset = {} - - # Return the union of graphs G and H. - def unionize(self, H, name=None, rename=(None, None)): - if not self.R.is_multigraph() == H.is_multigraph(): - raise nx.NetworkXError("G and H must both be graphs or multigraphs.") - - self.R.graph.update(H.graph) - - renamed_nodes = self.add_prefix(H, rename[1]) - - debug = False - if debug: - print("-=========================-") - print("Nodes in R and H are same? ", set(self.R) == set(H)) - if set(self.R) != set(H): - print("Difference is ", list(set(H) - set(self.R))) - print("Nodes in R", set(self.R)), - print("Nodes in H", set(H)) - print("-=========================-") - - if H.is_multigraph(): - H_edges = H.edges(keys=True, data=True) - else: - H_edges = H.edges(data=True) - - # add nodes and edges. 
- self.R.add_nodes_from(H) - self.R.add_edges_from(H_edges) - - # add node attributes for each run - for n in renamed_nodes: - self.add_node_attributes(H, n, name) - - # rename graph to obtain disjoint node labels - def add_prefix(self, graph, prefix): - if prefix is None: - return graph - - def label(x): - if is_string_like(x): - name = prefix + x - else: - name = prefix + repr(x) - return name - - return nx.relabel_nodes(graph, label) - - def add_edge_attributes(self): - number_of_runs_mapping = self.number_of_runs() - nx.set_edge_attributes( - self.R, name="number_of_runs", values=number_of_runs_mapping - ) - - def number_of_runs(self): - ret = {} - for idx, name in enumerate(self.runs): - for edge in self.runs[name].edges(): - if edge not in ret: - ret[edge] = 0 - ret[edge] += 1 - return ret - - def add_node_attributes(self, H, node, dataset_name): - for idx, (key, val) in enumerate(H.nodes.items()): - if dataset_name not in self.R.nodes[node]: - self.R.nodes[node][dataset_name] = 0 - if key == node: - self.R.nodes[node][dataset_name] = 1 diff --git a/callflow/logger.py b/callflow/logger.py index c94fc790..39f0a94e 100644 --- a/callflow/logger.py +++ b/callflow/logger.py @@ -25,9 +25,6 @@ def init_logger(**kwargs): level = int(kwargs.get("level", 2)) do_color = str(kwargs.get("color", True)) - # print ('level = ({})'.format(level)) - # print ('do_color = ({})'.format(do_color)) - # -------------------------------------------------------------------------- # get logging level in "logging" format assert level >= 1 and level <= 5 @@ -44,18 +41,6 @@ def init_logger(**kwargs): # -------------------------------------------------------------------------- # get loging format - """ - aliases = { - logging.DEBUG: "%(log_color)s(%(name)s.py) %(msg)s ", - logging.ERROR: "\033 %(log_color)s(%(name)s.py) ERROR: %(msg)s", - logging.CRITICAL: "\033 %(log_color)s(%(name)s.py) CRITICAL: %(msg)s", - logging.WARNING: "\033 %(log_color)s(%(name)s.py) WARN: %(msg)s", - logging.INFO: "%(log_color)s%(msg)s", - } - LOG_FMT = aliases[level] - """ - - # Harsh's suggestion # here, the initialization of the format doesnt depend upon "level" LOG_FMT = ( "%(asctime)s - %(name)s:%(funcName)s:%(lineno)s - %(levelname)s - %(message)s" diff --git a/callflow/modules/auxiliary_ensemble.py b/callflow/modules/auxiliary_ensemble.py index 7077526c..5fbbd69d 100644 --- a/callflow/modules/auxiliary_ensemble.py +++ b/callflow/modules/auxiliary_ensemble.py @@ -30,27 +30,57 @@ class EnsembleAuxiliary: def __init__( self, - states, + gf=callflow.GraphFrame, + datasets=[], + props={}, MPIBinCount="20", RunBinCount="20", - datasets=[], - config={}, process=True, write=False, ): - self.timer = Timer() - self.df = self.select_rows(states["ensemble_entire"].new_gf.df, datasets) + self.gf = gf self.MPIBinCount = MPIBinCount self.RunBinCount = RunBinCount - self.config = config - self.states = states + self.timer = Timer() + self.props = props + self.datasets = self.props["dataset_names"] + + self.df = self.select_rows(self.gf.df, self.datasets) + self.process = process self.write = write - self.datasets = datasets - self.props = ["rank", "name", "dataset", "all_ranks"] + self.hist_props = ["rank", "name", "dataset", "all_ranks"] self.filter = True + if process: + self.compute() + else: + self.read() + print(self.timer) + + def compute(self): + ret = {} + path = os.path.join(self.props["save_path"], "ensemble/auxiliary_data.json") + + LOGGER.info("Calculating Gradients, Mean runtime variations, and Distribution.") + with 
self.timer.phase("Process data"): + self.group_frames() + with self.timer.phase("Collect Callsite data"): + ret["callsite"] = self.callsite_data() + with self.timer.phase("Collect Module data"): + ret["module"] = self.module_data() + with self.timer.phase("Module callsite map data"): + ret["moduleCallsiteMap"] = self.get_module_callsite_map() + # with self.timer.phase("Callsite module map data"): + # ret['callsiteModuleMap'] = self.get_callsite_module_map() + # if self.write: + with self.timer.phase("Writing data"): + with open(path, "w") as f: + json.dump(ret, f) + + return ret + def filter_dict(self, result): ret = {} @@ -61,14 +91,14 @@ def filter_dict(self, result): ret["callsite"] = {} group_df = self.df.groupby(["name"]).mean() - if self.config.filter_by == "time": + if self.props["filter_by"] == "time": f_group_df = group_df.loc[ - group_df[self.config.filter_by] > self.config.filter_below + group_df[self.props["filter_by"]] > self.props["filter_below"] ] - elif self.config.filter_by == "time (inc)": + elif self.props["filter_by"] == "time (inc)": f_group_df = group_df.loc[ - group_df[self.config.filter_by] - > 0.01 * self.config.filter_perc * group_df["time (inc)"].max() + group_df[self.props["filter_by"]] + > 0.01 * self.props["filter_perc"] * group_df["time (inc)"].max() ] callsites = f_group_df.index.values.tolist() @@ -85,13 +115,12 @@ def filter_dict(self, result): def group_frames(self): if self.filter: - # self.df = self.df.loc[self.df['time'] > 0.01*self.config.filter_perc*self.df['time'].max() ] - # self.df = self.df.loc[self.df['time (inc)'] > self.config.filter_perc]['name'].unique() xgroup_df = self.df.groupby(["name"]).mean() sort_xgroup_df = xgroup_df.sort_values(by=["time (inc)"], ascending=False) top100callsites = sort_xgroup_df.nlargest(50, "time (inc)") self.df = self.df[self.df["name"].isin(top100callsites.index.values)] + self.df.drop(["rank"], axis=1) self.module_name_group_df = self.df.groupby(["module", "name"]) self.module_group_df = self.df.groupby(["module"]) self.name_group_df = self.df.groupby(["name"]) @@ -112,6 +141,122 @@ def group_frames(self): ["name"] ) + # Callsite grouped information + def callsite_data(self): + ret = {} + + # Create the data dict. + ensemble = {} + for callsite, callsite_df in self.name_group_df: + callsite_ensemble_df = self.name_group_df.get_group(callsite) + hists = {} + hists["Inclusive"] = {} + hists["Exclusive"] = {} + for prop in self.hist_props: + prop_histograms = self.histogram_by_property_ensemble( + callsite_ensemble_df, prop + ) + hists["Inclusive"][prop] = prop_histograms["Inclusive"] + hists["Exclusive"][prop] = prop_histograms["Exclusive"] + + gradients = Gradients(self.target_df, binCount=self.RunBinCount).run( + columnName="name", callsiteOrModule=callsite + ) + boxplot = BoxPlot(callsite_df) + ensemble[callsite] = self.pack_json( + callsite_df, + callsite, + gradients=gradients, + q=boxplot.q, + outliers=boxplot.outliers, + prop_hists=hists, + ) + + ret["ensemble"] = ensemble + + ## Target data. + # Loop through datasets and group the callsite by name. 
+ for dataset in self.datasets: + name_grouped = self.target_name_group_df[dataset] + target = {} + for callsite, callsite_df in name_grouped: + callsite_ensemble_df = self.name_group_df.get_group(callsite) + callsite_target_df = callsite_df + + if not callsite_df.empty: + hists = {} + hists["Inclusive"] = {} + hists["Exclusive"] = {} + for prop in self.hist_props: + prop_histograms = self.histogram_by_property( + callsite_ensemble_df, callsite_target_df, prop + ) + hists["Inclusive"][prop] = prop_histograms["Inclusive"] + hists["Exclusive"][prop] = prop_histograms["Exclusive"] + + boxplot = BoxPlot(callsite_df) + target[callsite] = self.pack_json( + df=callsite_target_df, + name=callsite, + prop_hists=hists, + q=boxplot.q, + outliers=boxplot.outliers, + ) + ret[dataset] = target + + return ret + + def module_data(self): + ret = {} + # Module grouped information + modules = self.df["module"].unique() + ensemble = {} + for module, module_df in self.module_group_df: + module_ensemble_df = self.module_group_df.get_group(module) + hists = {"Inclusive": {}, "Exclusive": {}} + for prop in self.hist_props: + prop_histograms = self.histogram_by_property_ensemble( + module_ensemble_df, prop + ) + hists["Inclusive"][prop] = prop_histograms["Inclusive"] + hists["Exclusive"][prop] = prop_histograms["Exclusive"] + + # Calculate gradients + gradients = Gradients(self.target_df, binCount=self.RunBinCount).run( + columnName="module", callsiteOrModule=module + ) + ensemble[module] = self.pack_json( + df=module_df, name=module, gradients=gradients, prop_hists=hists + ) + + ret["ensemble"] = ensemble + + for dataset in self.datasets: + target = {} + module_group_df = self.target_module_group_df[dataset] + for module, module_df in module_group_df: + module_ensemble_df = self.module_group_df.get_group(module) + module_target_df = module_df + gradients = {"Inclusive": {}, "Exclusive": {}} + hists = {"Inclusive": {}, "Exclusive": {}} + if not module_target_df.empty: + for prop in self.hist_props: + prop_histograms = self.histogram_by_property( + module_ensemble_df, module_target_df, prop + ) + hists["Inclusive"][prop] = prop_histograms["Inclusive"] + hists["Exclusive"][prop] = prop_histograms["Exclusive"] + target[module] = self.pack_json( + df=module_target_df, + name=module, + gradients=gradients, + prop_hists=hists, + ) + + ret[dataset] = target + + return ret + def select_rows(self, df, search_strings): unq, IDs = np.unique(df["dataset"], return_inverse=True) unqIDs = np.searchsorted(unq, search_strings) @@ -188,8 +333,8 @@ def pack_json( "id": "node-" + str(df["nid"].tolist()[0]), "dataset": df["dataset"].unique().tolist(), "module": df["module"].tolist()[0], - "callers": df["callers"].unique().tolist(), - "callees": df["callees"].unique().tolist(), + # "callers": df["callers"].unique().tolist(), + # "callees": df["callees"].unique().tolist(), "component_path": df["component_path"].unique().tolist(), "component_level": df["component_level"].unique().tolist(), "Inclusive": { @@ -243,6 +388,7 @@ def histogram_by_property_ensemble(self, ensemble_df, prop): time_ensemble_exclusive_arr = np.array(ensemble_df["time"].tolist()) elif prop == "rank": + ensemble_df.reset_index(drop=True, inplace=True) ensemble_prop = ensemble_df.groupby(["dataset", prop])[ ["time", "time (inc)"] ].mean() @@ -287,9 +433,12 @@ def histogram_by_property(self, ensemble_df, target_df, prop): time_target_inclusive_arr = np.array(target_df["time (inc)"].tolist()) time_target_exclusive_arr = np.array(target_df["time"].tolist()) elif prop == 
"rank": + ensemble_df.reset_index(drop=True, inplace=True) ensemble_prop = ensemble_df.groupby(["dataset", prop])[ ["time", "time (inc)"] ].mean() + + target_df.reset_index(drop=True, inplace=True) target_prop = target_df.groupby(["dataset", prop])[ ["time", "time (inc)"] ].mean() @@ -344,146 +493,3 @@ def histogram_by_property(self, ensemble_df, target_df, prop): "target": self.histogram_format(histogram_target_exclusive_grid), } return ret - - # Callsite grouped information - def callsite_data(self): - ret = {} - - # Create the data dict. - ensemble = {} - for callsite, callsite_df in self.name_group_df: - callsite_ensemble_df = self.name_group_df.get_group(callsite) - hists = {} - hists["Inclusive"] = {} - hists["Exclusive"] = {} - for prop in self.props: - prop_histograms = self.histogram_by_property_ensemble( - callsite_ensemble_df, prop - ) - hists["Inclusive"][prop] = prop_histograms["Inclusive"] - hists["Exclusive"][prop] = prop_histograms["Exclusive"] - - gradients = Gradients(self.target_df, binCount=self.RunBinCount).run( - columnName="name", callsiteOrModule=callsite - ) - boxplot = BoxPlot(callsite_df) - ensemble[callsite] = self.pack_json( - callsite_df, - callsite, - gradients=gradients, - q=boxplot.q, - outliers=boxplot.outliers, - prop_hists=hists, - ) - - ret["ensemble"] = ensemble - - ## Target data. - # Loop through datasets and group the callsite by name. - for dataset in self.datasets: - name_grouped = self.target_name_group_df[dataset] - target = {} - for callsite, callsite_df in name_grouped: - callsite_ensemble_df = self.name_group_df.get_group(callsite) - callsite_target_df = callsite_df - - if not callsite_df.empty: - hists = {} - hists["Inclusive"] = {} - hists["Exclusive"] = {} - for prop in self.props: - prop_histograms = self.histogram_by_property( - callsite_ensemble_df, callsite_target_df, prop - ) - hists["Inclusive"][prop] = prop_histograms["Inclusive"] - hists["Exclusive"][prop] = prop_histograms["Exclusive"] - - boxplot = BoxPlot(callsite_df) - target[callsite] = self.pack_json( - df=callsite_target_df, - name=callsite, - prop_hists=hists, - q=boxplot.q, - outliers=boxplot.outliers, - ) - ret[dataset] = target - - return ret - - def module_data(self): - ret = {} - # Module grouped information - modules = self.df["module"].unique() - ensemble = {} - for module, module_df in self.module_group_df: - module_ensemble_df = self.module_group_df.get_group(module) - hists = {"Inclusive": {}, "Exclusive": {}} - for prop in self.props: - prop_histograms = self.histogram_by_property_ensemble( - module_ensemble_df, prop - ) - hists["Inclusive"][prop] = prop_histograms["Inclusive"] - hists["Exclusive"][prop] = prop_histograms["Exclusive"] - - # Calculate gradients - gradients = Gradients(self.target_df, binCount=self.RunBinCount).run( - columnName="module", callsiteOrModule=module - ) - ensemble[module] = self.pack_json( - df=module_df, name=module, gradients=gradients, prop_hists=hists - ) - - ret["ensemble"] = ensemble - - for dataset in self.datasets: - target = {} - module_group_df = self.target_module_group_df[dataset] - for module, module_df in module_group_df: - module_ensemble_df = self.module_group_df.get_group(module) - module_target_df = module_df - gradients = {"Inclusive": {}, "Exclusive": {}} - hists = {"Inclusive": {}, "Exclusive": {}} - if not module_target_df.empty: - for prop in self.props: - prop_histograms = self.histogram_by_property( - module_ensemble_df, module_target_df, prop - ) - hists["Inclusive"][prop] = prop_histograms["Inclusive"] - 
hists["Exclusive"][prop] = prop_histograms["Exclusive"] - target[module] = self.pack_json( - df=module_target_df, - name=module, - gradients=gradients, - prop_hists=hists, - ) - - ret[dataset] = target - - return ret - - def run(self): - ret = {} - path = os.path.join(self.config.save_path, "all_data.json") - - if self.process: - LOGGER.info( - "Calculating Gradients, Mean runtime variations, and Distribution." - ) - with self.timer.phase("Process data"): - self.group_frames() - with self.timer.phase("Collect Callsite data"): - ret["callsite"] = self.callsite_data() - with self.timer.phase("Collect Module data"): - ret["module"] = self.module_data() - with self.timer.phase("Module callsite map data"): - ret["moduleCallsiteMap"] = self.get_module_callsite_map() - # with self.timer.phase("Callsite module map data"): - # ret['callsiteModuleMap'] = self.get_callsite_module_map() - if self.write: - with self.timer.phase("Writing data"): - with open(path, "w") as f: - json.dump(ret, f) - - LOGGER.debug(self.timer) - - return ret diff --git a/callflow/modules/auxiliary_single.py b/callflow/modules/auxiliary_single.py index ecf9982e..8ff8f859 100644 --- a/callflow/modules/auxiliary_single.py +++ b/callflow/modules/auxiliary_single.py @@ -21,13 +21,13 @@ class SingleAuxiliary: - def __init__(self, state, binCount="20", dataset="", config={}, process=True): - self.graph = state.new_gf.graph - self.df = state.new_gf.df - self.config = config + def __init__(self, gf, dataset="", MPIBinCount=20, props={}, process=True): + self.graph = gf.graph + self.df = gf.df + self.props = props self.process = process self.dataset = dataset - self.binCount = binCount + self.binCount = MPIBinCount ret_df = pd.DataFrame([]) self.timer = Timer() @@ -98,6 +98,9 @@ def pack_json(self, group_df, node_name, data_type): hist_inc_grid = self.histogram(time_inc_target_arr) hist_exc_grid = self.histogram(time_exc_target_arr) + if "rank" not in group_df.keys(): + group_df = group_df.reset_index(drop=False) + result = { "name": node_name, "time (inc)": group_df["time (inc)"].tolist(), @@ -168,11 +171,7 @@ def module_data(self): def run(self): ret = {} - path = ( - self.config.processed_path - + f"/{self.config.runName}" - + f"/{self.dataset}/all_data.json" - ) + path = self.props["save_path"] + f"/{self.dataset}/auxiliary_data.json" # self.process = True if os.path.exists(path) and not self.process: diff --git a/callflow/modules/function_list.py b/callflow/modules/function_list.py index 2d695da9..30faf082 100644 --- a/callflow/modules/function_list.py +++ b/callflow/modules/function_list.py @@ -37,7 +37,6 @@ def add_paths(self, path_name): for idx, row in self.df.iterrows(): # if row.show_node: path = row[path_name] - print(path) # TODO: Sometimes the path becomes a string. Find why it happens. # If it becomes a string if isinstance(path, str): diff --git a/callflow/modules/module_hierarchy.py b/callflow/modules/module_hierarchy.py index 40cdb0a7..ea3d8e31 100644 --- a/callflow/modules/module_hierarchy.py +++ b/callflow/modules/module_hierarchy.py @@ -11,13 +11,11 @@ LOGGER = callflow.get_logger(__name__) from callflow.timer import Timer -from callflow.utils import sanitizeName class ModuleHierarchy: - def __init__(self, state, module, config={}): - self.df = state.new_gf.df - self.config = config + def __init__(self, supergraph, module): + self.df = supergraph.gf.df self.module = module # Create the Super node's hierarchy. 
@@ -34,8 +32,8 @@ def create_source_targets(self, path): if idx == len(path) - 1: break - source = sanitizeName(path[idx]) - target = sanitizeName(path[idx + 1]) + source = callflow.utils.sanitize_name(path[idx]) + target = callflow.utils.sanitize_name(path[idx + 1]) edges.append({"source": source, "target": target}) return edges diff --git a/callflow/modules/parameter_projection.py b/callflow/modules/parameter_projection.py index cdecd85b..ac112c99 100644 --- a/callflow/modules/parameter_projection.py +++ b/callflow/modules/parameter_projection.py @@ -24,23 +24,22 @@ class ParameterProjection: - def __init__(self, state, similarities={}, targetDataset="", n_cluster=3): - # self.similarities = similarities[targetDataset] - # self.datasetOrder = {k: idx for idx, (k, v) in enumerate(similarities.items())} - self.state = state - self.df = state.new_gf.df - self.datasets = state.new_gf.df["dataset"].unique().tolist() + def __init__(self, supergraph, similarities={}, targetDataset="", n_cluster=3): + + self.df = supergraph.gf.df + self.datasets = self.df["dataset"].unique().tolist() self.projection = "MDS" self.clustering = "k_means" self.n_cluster = int(n_cluster) self.targetDataset = targetDataset - if len(self.datasets) > self.n_cluster: + if len(self.datasets) >= self.n_cluster: self.result = self.run() else: self.result = pd.DataFrame({}) def add_df_params(self, dataset): ret = {} + print(self.df) ret["max_inclusive_time"] = self.df.loc[self.df["dataset"] == dataset][ "time (inc)" ].max() diff --git a/callflow/operations/__init__.py b/callflow/operations/__init__.py new file mode 100644 index 00000000..7d815be2 --- /dev/null +++ b/callflow/operations/__init__.py @@ -0,0 +1,5 @@ +from .process import Process +from .group import Group +from .union_delete import Union +from .filter import Filter +from .read_config import ConfigFileReader diff --git a/callflow/pipeline/filter_networkx.py b/callflow/operations/filter.py similarity index 58% rename from callflow/pipeline/filter_networkx.py rename to callflow/operations/filter.py index 36dea19b..75731c8f 100644 --- a/callflow/pipeline/filter_networkx.py +++ b/callflow/operations/filter.py @@ -7,35 +7,41 @@ LOGGER = callflow.get_logger(__name__) -class FilterNetworkX: - def __init__(self, state): - self.df = state.new_gf.df - self.dataset_df = self.df.groupby(["dataset"]) - self.dataset_idx = {} +class Filter: + def __init__( + self, gf=None, mode="single", filter_by="time (inc)", filter_perc="10" + ): + self.gf = gf + self.filter_perc = filter_perc + self.set_max_min_times() + if filter_by == "time (inc)": + self.gf.df = self.df_by_time_inc() + self.gf.nxg = self.graph_by_time_inc() + elif filter_by == "time": + self.gf.df = self.df_by_time() + self.gf.nxg = self.graph_by_time() + def set_max_min_times(self): self.max_time_inc_list = np.array([]) self.min_time_inc_list = np.array([]) self.max_time_exc_list = np.array([]) self.min_time_exc_list = np.array([]) - count = 0 - for dataset, df in self.dataset_df: - self.dataset_idx[dataset] = count - self.max_time_inc_list = np.hstack( - [self.max_time_inc_list, df["time (inc)"].max()] - ) - self.min_time_inc_list = np.hstack( - [self.min_time_inc_list, df["time (inc)"].min()] - ) - self.max_time_exc_list = np.hstack( - [self.max_time_exc_list, df["time"].max()] - ) - self.min_time_exc_list = np.hstack( - [self.min_time_exc_list, df["time"].min()] - ) - count += 1 - LOGGER.info("Dataset idx: ", self.dataset_idx) + + self.max_time_inc_list = np.hstack( + [self.max_time_inc_list, self.gf.df["time 
(inc)"].max()] + ) + self.min_time_inc_list = np.hstack( + [self.min_time_inc_list, self.gf.df["time (inc)"].min()] + ) + self.max_time_exc_list = np.hstack( + [self.max_time_exc_list, self.gf.df["time"].max()] + ) + self.min_time_exc_list = np.hstack( + [self.min_time_exc_list, self.gf.df["time"].min()] + ) + LOGGER.info(f"Min. time (inc): {self.min_time_inc_list}") LOGGER.info(f"Max. time (inc): {self.max_time_inc_list}") LOGGER.info(f"Min. time (exc): {self.min_time_exc_list}") @@ -46,35 +52,33 @@ def set_max_min_times(self): self.max_time_exc = np.max(self.max_time_exc_list) self.min_time_exc = np.min(self.min_time_exc_list) - def filter_df_by_time_inc(self, perc): - LOGGER.debug(f"[Filter] By Inclusive time : {perc}") - df = self.df.loc[(self.df["time (inc)"] > perc * 0.01 * self.max_time_inc)] + def df_by_time_inc(self): + LOGGER.debug(f"[Filter] By Inclusive time : {self.filter_perc}") + df = self.gf.df.loc[ + (self.gf.df["time (inc)"] > self.filter_perc * 0.01 * self.max_time_inc) + ] filter_call_sites = df["name"].unique() return df[df["name"].isin(filter_call_sites)] - def filter_df_by_time(self, perc): - LOGGER.debug(f"[Filter] By Exclusive time : {perc}") - # df = self.df.loc[self.df['time'] > perc * 0.01 * self.max_time_exc] - df = self.df.loc[self.df["time"] > perc] + def df_by_time(self, perc): + LOGGER.debug(f"[Filter] By Exclusive time : {self.filter_perc}") + df = self.gf.df.loc[self.gf.df["time"] > self.filter_perc] filter_call_sites = df["name"].unique() - print(filter_call_sites) return df[df["name"].isin(filter_call_sites)] - def filter_graph_by_time_inc(self, df, g): - callsites = df["name"].unique() + def graph_by_time_inc(self): + callsites = self.gf.df["name"].unique() ret = nx.DiGraph() - - for edge in g.edges(): + for edge in self.gf.nxg.edges(): # If source is present in the callsites list if edge[0] in callsites and edge[1] in callsites: ret.add_edge(edge[0], edge[1]) else: - LOGGER.info(f"Removing the edge: {edge}") + LOGGER.debug(f"Removing the edge: {edge}") return ret - # Refer https://stackoverflow.com/questions/28095646/finding-all-paths-walks-of-given-length-in-a-networkx-graph def findPaths(self, g, u, n, excludeSet=None): if excludeSet == None: excludeSet = set([u]) @@ -96,7 +100,7 @@ def findPaths(self, g, u, n, excludeSet=None): excludeSet.remove(u) return paths - def filter_graph_by_time(self, df, g): + def graph_by_time(self, df, g): callsites = df["name"].unique() ret = nx.DiGraph() diff --git a/callflow/pipeline/group_by_module.py b/callflow/operations/group.py similarity index 70% rename from callflow/pipeline/group_by_module.py rename to callflow/operations/group.py index b9432630..a3df6881 100644 --- a/callflow/pipeline/group_by_module.py +++ b/callflow/operations/group.py @@ -3,57 +3,119 @@ import networkx as nx from ast import literal_eval as make_list +import callflow -class Callsite: - def __init__(self, name, module): - self.name = name - self.module = module +LOGGER = callflow.get_logger(__name__) -class groupBy: - def __init__(self, state, group_by): - self.state = state - # self.g = state.g - # self.df = self.state.df - self.g = self.state.new_gf.nxg - self.df = self.state.new_gf.df +class Group(callflow.GraphFrame): + def __init__(self, gf=None, group_by="name"): + self.gf = gf self.group_by = group_by - self.eliminate_funcs = [] + + # Data. + self.callsite_module_map = self.gf.df.set_index("name")["module"].to_dict() + self.callsite_path_map = self.gf.df.set_index("name")["path"].to_dict() + + # Variables used by grouping operation. 
self.entry_funcs = {} - self.module_func_map = {} self.other_funcs = {} - self.module_id_map = {} - - self.drop_eliminate_funcs() - self.name_module_map = self.df.set_index("name")["module"].to_dict() - self.name_path_map = self.df.set_index("name")["path"].to_dict() - - self.run() - self.df = self.state.new_gf.df - self.graph = self.state.new_gf.graph - # self.df = self.state.df - # self.graph = self.state.graph - - # Drop all entries user does not want to see. - def drop_eliminate_funcs(self): - for idx, func in enumerate(self.eliminate_funcs): - # self.state.df = self.state.df[self.state.df["module"] != func] - self.state.new_gf.df = self.state.new_gf.df[ - self.state.new_gf.df["module"] != func - ] + # TODO: remove this. + # self.module_id_map = {} + + self.compute() + + def compute(self): + group_path = {} + component_path = {} + component_level = {} + entry_func = {} + show_node = {} + node_name = {} + module = {} + change_name = {} + + # module_idx = {} + # module_id_map = {} + # module_count = 0 + + LOGGER.debug( + f"Nodes: {len(self.gf.nxg.nodes())}, Edges: {len(self.gf.nxg.edges())}" + ) + + for idx, edge in enumerate(self.gf.nxg.edges()): + snode = edge[0] + tnode = edge[1] + + if "/" in snode: + snode = snode.split("/")[-1] + if "/" in tnode: + tnode = tnode.split("/")[-1] + + spath = self.callsite_path_map[snode] + tpath = self.callsite_path_map[tnode] + + stage1 = time.perf_counter() + temp_group_path_results = self.create_group_path(spath) + group_path[snode] = temp_group_path_results + stage2 = time.perf_counter() + + stage3 = time.perf_counter() + component_path[snode] = self.create_component_path(spath, group_path[snode]) + component_level[snode] = len(component_path[snode]) + stage4 = time.perf_counter() + + temp_group_path_results = self.create_group_path(tpath) + group_path[tnode] = temp_group_path_results + + component_path[tnode] = self.create_component_path(tpath, group_path[tnode]) + component_level[tnode] = len(component_path[tnode]) + + if component_level[snode] == 2: + entry_func[snode] = True + show_node[snode] = True + else: + entry_func[snode] = False + show_node[snode] = False + + node_name[snode] = self.callsite_module_map[snode] + "=" + snode + + # TODO: remove if not used. + # if module[tnode] not in module_id_map: + # module_count += 1 + # module_id_map[module[tnode]] = module_count + # module_idx[tnode] = module_id_map[module[tnode]] + # else: + # module_idx[tnode] = module_id_map[module[tnode]] + + if component_level[tnode] == 2: + entry_func[tnode] = True + show_node[tnode] = True + else: + entry_func[tnode] = False + show_node[tnode] = False + + node_name[tnode] = self.callsite_module_map[snode] + "=" + tnode + + self.update_df("group_path", group_path) + self.update_df("component_path", component_path) + self.update_df("show_node", entry_func) + self.update_df("vis_name", node_name) + self.update_df("component_level", component_level) + # self.update_df("mod_index", module_idx) + self.update_df("entry_function", entry_func) def create_group_path(self, path): if isinstance(path, str): path = make_list(path) - group_path = [] prev_module = None for idx, callsite in enumerate(path): if idx == 0: # Assign the first callsite as from_callsite and not push into an array. 
from_callsite = callsite - - from_module = self.name_module_map[from_callsite] + # from_module = self.entire_df.loc[self.entire_df['name'] == from_callsite]['module'].unique()[0] + from_module = self.callsite_module_map[from_callsite] # Store the previous module to check the hierarchy later. prev_module = from_module @@ -75,8 +137,8 @@ def create_group_path(self, path): to_callsite = callsite if "/" in to_callsite: to_callsite = to_callsite.split("/")[-1] - # to_module = self.entire_df.loc[self.entire_df['name'] == to_callsite]['module'].unique()[0] - to_module = self.name_module_map[to_callsite] + + to_module = self.callsite_module_map[to_callsite] if prev_module != to_module: group_path.append(to_module + "=" + to_callsite) @@ -96,12 +158,8 @@ def create_group_path(self, path): from_callsite = path[idx - 1] to_callsite = callsite - # Get their modules. - # from_module = self.entire_df.loc[self.entire_df['name'] == from_callsite]['module'].unique()[0] - # to_module = self.entire_df.loc[self.entire_df['name'] == to_callsite]['module'].unique()[0] - - from_module = self.name_module_map[from_callsite] - to_module = self.name_module_map[to_callsite] + from_module = self.callsite_module_map[from_callsite] + to_module = self.callsite_module_map[to_callsite] # Create the entry function and other function dict if not already present. if to_module not in self.entry_funcs: @@ -123,7 +181,7 @@ def create_group_path(self, path): elif to_module == prev_module: to_callsite = callsite # to_module = self.entire_df.loc[self.entire_df['name'] == to_callsite]['module'].unique()[0] - to_module = self.name_module_map[to_callsite] + to_module = self.callsite_module_map[to_callsite] prev_module = to_module @@ -140,7 +198,7 @@ def create_component_path(self, path, group_path): node_func = node if "/" in node: node = node.split("/")[-1] - module = self.name_module_map[node] + module = self.callsite_module_map[node] if component_module == module: component_path.append(node_func) @@ -148,63 +206,6 @@ def create_component_path(self, path, group_path): return tuple(component_path) def update_df(self, col_name, mapping): - self.df[col_name] = self.df["name"].apply( + self.gf.df[col_name] = self.gf.df["name"].apply( lambda node: mapping[node] if node in mapping.keys() else "" ) - - def run(self): - group_path = {} - component_path = {} - component_level = {} - entry_func = {} - show_node = {} - node_name = {} - module = {} - change_name = {} - module_idx = {} - source_nid = {} - - module_id_map = {} - module_count = 0 - - edge_count = 0 - - for edge in self.g.edges(): - edge_count += 1 - snode = edge[0] - tnode = edge[1] - - spath = self.name_path_map[snode] - tpath = self.name_path_map[tnode] - - temp_group_path_results = self.create_group_path(spath) - group_path[snode] = temp_group_path_results - - component_path[snode] = self.create_component_path(spath, group_path[snode]) - component_level[snode] = len(component_path[snode]) - module[snode] = self.name_module_map[snode] - - temp_group_path_results = self.create_group_path(tpath) - group_path[tnode] = temp_group_path_results - - component_path[tnode] = self.create_component_path(tpath, group_path[tnode]) - component_level[tnode] = len(component_path[tnode]) - module[tnode] = self.name_module_map[tnode] - - if component_level[snode] == 2: - entry_func[snode] = True - show_node[snode] = True - else: - entry_func[snode] = False - show_node[snode] = False - - node_name[snode] = self.name_module_map[snode] + "=" + snode - - self.update_df("group_path", group_path) - 
self.update_df("component_path", component_path) - self.update_df("show_node", entry_func) - self.update_df("vis_name", node_name) - self.update_df("component_level", component_level) - self.update_df("change_name", change_name) - self.update_df("mod_index", module_idx) - self.update_df("entry_function", entry_func) diff --git a/callflow/pipeline/process.py b/callflow/operations/process.py similarity index 58% rename from callflow/pipeline/process.py rename to callflow/operations/process.py index d6bf34fd..9961b3f8 100644 --- a/callflow/pipeline/process.py +++ b/callflow/operations/process.py @@ -16,79 +16,28 @@ import numpy as np from scipy.stats import kurtosis, skew -from callflow.utils import ( - sanitizeName, - visModuleCallsiteName, - getNodeDictFromFrame, - getPathListFromFrames, -) - -# from callflow.logger import Log import callflow LOGGER = callflow.get_logger(__name__) -""" -# no need for this decorator -def logger(func): - @wraps(func) - def tmp(*args, **kwargs): - log = Log("process") - log.info("Preprocessing : {0}".format(func.__name__)) - return func(*args, **kwargs) - return tmp -""" - - -class PreProcess: +class Process: """ Preprocess the dataframe Builder object Preprocess.add_X().add_Y()..... """ - def __init__(self, builder): - self.gf = builder.gf - self.df = builder.df - self.graph = builder.graph + def __init__(self, gf, tag): + self.gf = gf + self.tag = tag class Builder(object): - def __init__(self, state, gf_type="entire"): - # self.log = Log("process") - self.state = state + def __init__(self, gf, tag): + self.gf = gf + self.tag = tag - self.callers = {} - self.callees = {} - self.frames = {} - self.paths = {} - self.hatchet_nodes = {} - - if gf_type == "filter": - # self.gf = state.gf - # self.df = state.df - # self.graph = state.entire_graph - self.gf = state.new_gf - self.df = state.new_gf.df - self.graph = state.new_entire_gf.graph - elif gf_type == "entire": - # self.gf = state.entire_gf - # self.df = state.entire_df - # self.graph = state.entire_graph - self.gf = state.new_entire_gf - self.df = state.new_entire_gf.df - self.graph = state.new_entire_gf.graph - - # Logger Information - self.cct_nodes = [] - self.callgraph_nodes = [] - self.supergraph_nodes = [] - self.unmapped_targets = [] - - self.callgraph_nodes_np = np.array([]) - self.cct_nodes_np = np.array([]) self.graphMapper() - self.map = {} def convertFrameList(self, nodes): ret = [] @@ -97,18 +46,23 @@ def convertFrameList(self, nodes): return ret def graphMapper(self): - graph = self.graph + self.callers = {} + self.callees = {} + self.paths = {} + self.hatchet_nodes = {} - for node in graph.traverse(): - node_dict = getNodeDictFromFrame(node.frame) + for node in self.gf.graph.traverse(): + node_dict = callflow.utils.node_dict_from_frame(node.frame) if node_dict["type"] == "loop": - node_name = "Loop@" + sanitizeName( + node_name = "Loop@" + callflow.utils.sanitize_name( node_dict["name"] + ":" + str(node_dict["line"]) ) elif node_dict["type"] == "statement": node_name = ( - sanitizeName(node_dict["name"]) + ":" + str(node_dict["line"]) + callflow.utils.sanitize_name(node_dict["name"]) + + ":" + + str(node_dict["line"]) ) else: node_name = node_dict["name"] @@ -120,19 +74,19 @@ def graphMapper(self): self.hatchet_nodes[node_name] = node def build(self): - return PreProcess(self) + return Process(self.gf, self.tag) # Add the path information from the node object - # @logger def add_path(self): self.raiseExceptionIfNodeCountNotEqual(self.paths) - self.df["path"] = self.df["name"].apply( - lambda 
node_name: getPathListFromFrames(self.paths[node_name]) + self.gf.df["path"] = self.gf.df["name"].apply( + lambda node_name: callflow.utils.path_list_from_frames( + self.paths[node_name] + ) ) return self # Imbalance percentage Series in the dataframe - # @logger def add_imbalance_perc(self): inclusive = {} exclusive = {} @@ -145,8 +99,8 @@ def add_imbalance_perc(self): kurtosis_inclusive = {} kurtosis_exclusive = {} - for node_name in self.df["name"].unique(): - node_df = self.df.loc[self.df["name"] == node_name] + for node_name in self.gf.df["name"].unique(): + node_df = self.gf.df.loc[self.gf.df["name"] == node_name] max_incTime = node_df["time"].mean() mean_incTime = node_df["time (inc)"].mean() @@ -173,112 +127,112 @@ def add_imbalance_perc(self): kurtosis_inclusive[node_name] = kurtosis(node_df["time (inc)"].tolist()) kurtosis_exclusive[node_name] = kurtosis(node_df["time"].tolist()) - self.df["imbalance_perc_inclusive"] = self.df["name"].apply( + self.gf.df["imbalance_perc_inclusive"] = self.gf.df["name"].apply( lambda name: inclusive[name] ) - self.df["imbalance_perc_exclusive"] = self.df["name"].apply( + self.gf.df["imbalance_perc_exclusive"] = self.gf.df["name"].apply( lambda name: exclusive[name] ) - self.df["std_deviation_inclusive"] = self.df["name"].apply( + self.gf.df["std_deviation_inclusive"] = self.gf.df["name"].apply( lambda name: std_deviation_inclusive[name] ) - self.df["std_deviation_exclusive"] = self.df["name"].apply( + self.gf.df["std_deviation_exclusive"] = self.gf.df["name"].apply( lambda name: std_deviation_exclusive[name] ) - self.df["skewness_inclusive"] = self.df["name"].apply( + self.gf.df["skewness_inclusive"] = self.gf.df["name"].apply( lambda name: skewness_inclusive[name] ) - self.df["skewness_exclusive"] = self.df["name"].apply( + self.gf.df["skewness_exclusive"] = self.gf.df["name"].apply( lambda name: skewness_exclusive[name] ) - self.df["kurtosis_inclusive"] = self.df["name"].apply( + self.gf.df["kurtosis_inclusive"] = self.gf.df["name"].apply( lambda name: kurtosis_inclusive[name] ) - self.df["kurtosis_exclusive"] = self.df["name"].apply( + self.gf.df["kurtosis_exclusive"] = self.gf.df["name"].apply( lambda name: kurtosis_exclusive[name] ) return self - # @logger def add_callers_and_callees(self): - self.df["callees"] = self.df["name"].apply(lambda node: self.callees[node]) - self.df["callers"] = self.df["name"].apply(lambda node: self.callers[node]) + self.gf.df["callees"] = self.gf.df["name"].apply( + lambda node: self.callees[node] + ) + self.gf.df["callers"] = self.gf.df["name"].apply( + lambda node: self.callers[node] + ) return self # node_name is different from name in dataframe. So creating a copy of it. 
- # @logger def add_vis_node_name(self): - self.module_group_df = self.df.groupby(["module"]) + self.module_group_df = self.gf.df.groupby(["module"]) self.module_callsite_map = self.module_group_df["name"].unique() - self.name_group_df = self.df.groupby(["name"]) + self.name_group_df = self.gf.df.groupby(["name"]) self.callsite_module_map = self.name_group_df["module"].unique().to_dict() - self.df["vis_node_name"] = self.df["name"].apply( - lambda name: sanitizeName(self.callsite_module_map[name][0]) + self.gf.df["vis_node_name"] = self.gf.df["name"].apply( + lambda name: callflow.utils.sanitize_name( + self.callsite_module_map[name][0] + ) + "=" + name ) return self - # @logger def add_node_name_hpctoolkit(self, node_name_map): - self.df["node_name"] = self.df["name"].apply( + self.gf.df["node_name"] = self.gf.df["name"].apply( lambda name: node_name_map[name] ) return self - # @logger def add_module_name_hpctoolkit(self): - self.df["module"] = self.df["module"].apply(lambda name: sanitizeName(name)) + self.gf.df["module"] = self.gf.df["module"].apply( + lambda name: callflow.utils.sanitize_name(name) + ) return self - # @logger def add_node_name_caliper(self, node_module_map): - self.df["node_name"] = self.df["name"].apply( + self.gf.df["node_name"] = self.gf.df["name"].apply( lambda name: name_module_map[name] ) - # @logger def add_module_name_caliper(self, module_map): - self.df["module"] = self.df["name"].apply(lambda name: module_map[name]) + self.gf.df["module"] = self.gf.df["name"].apply( + lambda name: module_map[name] + ) return self - # @logger def add_dataset_name(self): - self.df["dataset"] = self.state.name + self.gf.df["dataset"] = self.tag return self - # @logger def add_rank_column(self): - if "rank" not in self.df.columns: - self.df["rank"] = 0 + if "rank" not in self.gf.df.columns: + self.gf.df["rank"] = 0 return self - # @logger def add_time_columns(self): - if "time (inc)" not in self.df.columns: - self.df["time (inc)"] = self.df["inclusive#time.duration"] + if "time (inc)" not in self.gf.df.columns: + self.gf.df["time (inc)"] = self.gf.df["inclusive#time.duration"] - if "time" not in self.df.columns: - self.df["time"] = self.df["sum#time.duration"] + if "time" not in self.gf.df.columns: + self.gf.df["time"] = self.gf.df["sum#time.duration"] return self - # @logger def create_name_module_map(self): self.name_module_map = ( - self.df.groupby(["name"])["module"].unique().to_dict() + self.gf.df.groupby(["name"])["module"].unique().to_dict() ) return self def raiseExceptionIfNodeCountNotEqual(self, attr): map_node_count = len(attr.keys()) - df_node_count = len(self.df["name"].unique()) + df_node_count = len(self.gf.df["name"].unique()) LOGGER.debug( f"[Validation] Map contains: {map_node_count} callsites, graph contains: {df_node_count} callsites" ) @@ -287,9 +241,8 @@ def raiseExceptionIfNodeCountNotEqual(self, attr): f"Unmatched Preprocessing maps: Map contains: {map_node_count} nodes, graph contains: {df_node_count} nodes" ) - # @logger def logInformation(self): LOGGER.info(f"CCT node count : {len(self.cct_nodes)}") LOGGER.info(f"CallGraph node count: {len(self.callgraph_nodes)}") - LOGGER.info(f"SuperGraph node count: {len(self.df['module'].unique())}") + LOGGER.info(f"SuperGraph node count: {len(self.gf.df['module'].unique())}") return self diff --git a/callflow/pipeline/read_config.py b/callflow/operations/read_config.py similarity index 98% rename from callflow/pipeline/read_config.py rename to callflow/operations/read_config.py index 2bf6719f..73ba8b9a 100644 --- 
a/callflow/pipeline/read_config.py +++ b/callflow/operations/read_config.py @@ -39,6 +39,7 @@ def __init__(self, filepath=None, config_json=None): self.datasets = self.json["datasets"] self.runName = self.json["run_name"] self.save_path = os.path.join(self.data_path, ".callflow") + self.read_parameter = self.json["read_parameter"] self.run() diff --git a/callflow/operations/union_delete.py b/callflow/operations/union_delete.py new file mode 100644 index 00000000..4878ba75 --- /dev/null +++ b/callflow/operations/union_delete.py @@ -0,0 +1,74 @@ +import networkx as nx + +# Mostly derive from supergraph. +# Should contain the vector that stores the properties as explained in paper. +# should contain a function `create` which contains the +class Union(nx.DiGraph): + def __init__(self): + self.union = nx.DiGraph() + + # Return the union of graphs G and H. + def unionize(self, nxg, name=None, rename=(None, None)): + if not self.union.is_multigraph() == H.is_multigraph(): + raise nx.NetworkXError("G and H must both be graphs or multigraphs.") + + self.union.graph.update(nxg) + + renamed_nodes = self.add_prefix(nxg, rename[1]) + + LOGGER.debug("-=========================-") + LOGGER.debug("Nodes in R and H are same? ", set(self.union) == set(nxg)) + if set(self.union) != set(H): + LOGGER.debug("Difference is ", list(set(H) - set(self.union))) + LOGGER.debug("Nodes in R", set(self.union)), + LOGGER.debug("Nodes in H", set(nxg)) + LOGGER.debug("-=========================-") + + if nxg.is_multigraph(): + new_edges = nxg.edges(keys=True, data=True) + else: + new_edges = nxg.edges(data=True) + + # add nodes and edges. + self.union.add_nodes_from(nxg) + self.union.add_edges_from(new_edges) + + # add node attributes for each run + for n in renamed_nodes: + self.add_node_attributes(nxg, n, name) + + # rename graph to obtain disjoint node labels + def add_prefix(self, graph, prefix): + if prefix is None: + return graph + + def label(x): + if is_string_like(x): + name = prefix + x + else: + name = prefix + repr(x) + return name + + return nx.relabel_nodes(graph, label) + + def add_edge_attributes(self): + number_of_runs_mapping = self.number_of_runs() + nx.set_edge_attributes( + self.union, name="number_of_runs", values=number_of_runs_mapping + ) + + def number_of_runs(self): + ret = {} + for idx, name in enumerate(self.unionuns): + for edge in self.unionuns[name].edges(): + if edge not in ret: + ret[edge] = 0 + ret[edge] += 1 + return ret + + def add_node_attributes(self, H, node, dataset_name): + for idx, (key, val) in enumerate(H.nodes.items()): + if dataset_name not in self.union.nodes[node]: + self.union.nodes[node][dataset_name] = 0 + if key == node: + self.union.nodes[node][dataset_name] = 1 diff --git a/callflow/pipeline/__init__.py b/callflow/pipeline/__init__.py deleted file mode 100644 index 9e43ccf6..00000000 --- a/callflow/pipeline/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .read_config import ConfigFileReader -from .filter_networkx import FilterNetworkX -from .group_by_module import groupBy -from .process import PreProcess -from .index import Pipeline -from .state import State -from .convert_hatchet_to_networkx import HatchetToNetworkX diff --git a/callflow/pipeline/convert_hatchet_to_networkx.py b/callflow/pipeline/convert_hatchet_to_networkx.py deleted file mode 100644 index 20d5dd33..00000000 --- a/callflow/pipeline/convert_hatchet_to_networkx.py +++ /dev/null @@ -1,144 +0,0 @@ -import networkx as nx -import math -import json -from ast import literal_eval as make_tuple - -import 
callflow - -LOGGER = callflow.get_logger(__name__) -from callflow.utils import getNodeDictFromFrame, sanitizeName - - -class HatchetToNetworkX(nx.Graph): - # Attributes: - # 1. State => Pass the state which needs to be handled. - # 2. path => '', 'path', 'group_path' or 'component_path' - # 3. construct_graph -> To decide if we should construct graph from path - # 4. add_data => To - def __init__( - self, - state, - graph_type="entire", - path_column_name="path", - construct_graph=True, - add_data=True, - ): - super(HatchetToNetworkX, self).__init__() - self.path_column_name = path_column_name - self.state = state - - if graph_type == "entire": - self.df = state.new_entire_gf.df - self.graph = state.new_entire_gf.graph - else: - self.df = state.new_gf.df - self.graph = state.new_gf.graph - - if construct_graph: - LOGGER.info("Creating a Graph for {0}.".format(self.state.name)) - self.nxg = nx.DiGraph() - self.add_paths_from_graph() - else: - print("Using the existing graph from state {0}".format(state.name)) - self.nxg = state.new_gf.nxg - - self.adj_matrix = nx.adjacency_matrix(self.nxg) - self.dense_adj_matrix = self.adj_matrix.todense() - - # TODO: Store the adjacency matrix also somewhere. - - if add_data: - self.add_node_attributes() - self.add_edge_attributes() - else: - pass - - # TODO: Need to raise exception when the state.g is incorrect. - # self.raiseExceptionIfNetworkXGraphIsIncorrect() - - def no_cycle_path(self, path): - ret = [] - mapper = {} - for idx, elem in enumerate(path): - if elem not in mapper: - mapper[elem] = 1 - ret.append(elem) - else: - ret.append(elem + "_" + str(mapper[elem])) - mapper[elem] += 1 - - return tuple(ret) - - # This is really slow for large dataframes. - def add_paths_from_df(self): - for idx, row in self.df.iterrows(): - if row.show_node: - if isinstance(row[self.path_column_name], list): - path_tuple = row[self.path_column_name] - else: - path_tuple = make_tuple(row[self.path_column_name]) - corrected_path = self.no_cycle_path(path_tuple) - self.nxg.add_path(corrected_path) - - def add_paths_from_graph(self): - graph = self.graph - - for root in graph.roots: - node_gen = root.traverse() - - root_dict = getNodeDictFromFrame(root.frame) - root_name = root_dict["name"] - root_paths = root.paths() - node = root - - try: - while node: - node_dict = getNodeDictFromFrame(node.frame) - node_name = node_dict["name"] - - # Get all node paths from hatchet. 
- node_paths = node.paths() - - # - for node_path in node_paths: - if len(node_path) >= 2: - - source_node_dict = getNodeDictFromFrame(node_path[-2]) - target_node_dict = getNodeDictFromFrame(node_path[-1]) - - if source_node_dict["line"] != "NA": - source_node_name = ( - sanitizeName(source_node_dict["name"]) - + ":" - + str(source_node_dict["line"]) - ) - else: - source_node_name = sanitizeName( - source_node_dict["name"] - ) - if target_node_dict["line"] != "NA": - target_node_name = ( - sanitizeName(target_node_dict["name"]) - + ":" - + str(target_node_dict["line"]) - ) - else: - target_node_name = sanitizeName( - target_node_dict["name"] - ) - self.nxg.add_edge(source_node_name, target_node_name) - node = next(node_gen) - - except StopIteration: - pass - finally: - del root - - def add_node_attributes(self): - pass - - def add_edge_attributes(self): - pass - - def raiseExceptionIfNetworkXGraphIsIncorrect(self): - print(len(self.graph), len(self.nxg.nodes)) diff --git a/callflow/pipeline/filter_hatchet.py b/callflow/pipeline/filter_hatchet.py deleted file mode 100644 index e8a85bae..00000000 --- a/callflow/pipeline/filter_hatchet.py +++ /dev/null @@ -1,90 +0,0 @@ -############################################################################## -# Copyright (c) 2018-2019, Lawrence Livermore National Security, LLC. -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of Callflow. -# Created by Suraj Kesavan . -# LLNL-CODE-741008. All rights reserved. -# -# For details, see: https://github.com/LLNL/Callflow -# Please also read the LICENSE file for the MIT License notice. -############################################################################## -import pandas as pd -import time - -import callflow - -LOGGER = callflow.get_logger(__name__) - - -class FilterHatchet: - """ - Filter the graphframe. - Input: State object, parameter to filterBy (could be inclusive/exclusive, - filterPerc: user provided filter percentage (1-100)) - """ - - def __init__(self, state, filterBy, filterPerc): - self.state = state - - self.graph = state.new_entire_gf.graph - self.df = state.new_entire_gf.df - self.gf = state.new_entire_gf - - # self.df.set_index(['node', 'rank'], drop=False, inplace=True) - - # self.df = pd.MultiIndex.from_frame(self.df, names=['node', 'rank']) - self.gf.dataframe = self.df - - self.filterBy = filterBy - self.filterPercInDecimals = int(1) / 100 - # self.filterPercInDecimals = 0.001 - - self.fgf = self.run() - self.fgf = self.graft() - - # update df and graph after filtering. - self.df = self.fgf.dataframe - self.graph = self.fgf.graph - - def run(self): - LOGGER.info("Filtering the graph.") - t = time.time() - if self.filterBy == "Inclusive": - max_inclusive_time = utils.getMaxIncTime_from_gf(self.graph, self.df) - filter_gf = self.gf.filter( - lambda x: True - if (x["time (inc)"] > self.filterPercInDecimals * max_inclusive_time) - else False - ) - elif filterBy == "Exclusive": - max_exclusive_time = utils.getMaxExcTime_from_gf(self.graph, self.df) - LOGGER.info("[Filter] By Exclusive time = {0})".format(max_exclusive_time)) - filter_gf = self.gf.filter( - lambda x: True - if (x["time"] >= self.filterPercInDecimals * max_exclusive_time) - else False - ) - else: - LOGGER.warn("Not filtering.... Can take forever. Thou were warned") - filter_gf = self.gf - - LOGGER.info( - "[Filter] Removed {0} rows. 
(time={1})".format( - self.gf.dataframe.shape[0] - filter_gf.dataframe.shape[0], - time.time() - t, - ) - ) - - return filter_gf - - def graft(self): - LOGGER.info("Squashing the graph.") - t = time.time() - fgf = self.fgf.squash() - LOGGER.info( - "[Squash] {1} rows in dataframe (time={0})".format( - time.time() - t, fgf.dataframe.shape[0] - ) - ) - return fgf diff --git a/callflow/pipeline/group_by_module_ensemble.py b/callflow/pipeline/group_by_module_ensemble.py deleted file mode 100644 index 41a33242..00000000 --- a/callflow/pipeline/group_by_module_ensemble.py +++ /dev/null @@ -1,266 +0,0 @@ -import pandas as pd -import time -import networkx as nx -from ast import literal_eval as make_list - -import callflow - -LOGGER = callflow.get_logger(__name__) - - -class ensembleGroupBy: - def __init__(self, state_entire, state_filter, group_by): - self.state_filter = state_filter - self.state_entire = state_entire - self.entire_df = self.state_entire.new_gf.df - self.filter_df = self.state_filter.new_gf.df - self.filter_g = self.state_filter.new_gf.nxg - - self.group_by = group_by - self.eliminate_funcs = [] - self.entry_funcs = {} - self.module_func_map = {} - self.other_funcs = {} - self.module_id_map = {} - - self.drop_eliminate_funcs() - self.name_module_map = self.entire_df.set_index("name")["module"].to_dict() - self.entire_df["path"] = self.entire_df["path"].apply( - lambda path: make_list(path) - ) - self.name_path_map = self.entire_df.set_index("name")["path"].to_dict() - - # Drop all entries user does not want to see. - def drop_eliminate_funcs(self): - for idx, func in enumerate(self.eliminate_funcs): - self.state.new_gf.df = self.state.new_gf.df[ - self.state.new_gf.df["module"] != func - ] - - def create_group_path_time(self, path): - if isinstance(path, str): - path = make_list(path) - group_path = [] - prev_module = None - for idx, callsite in enumerate(path): - if idx == 0: - # Assign the first callsite as from_callsite and not push into an array. - from_callsite = callsite - # from_module = self.entire_df.loc[self.entire_df['name'] == from_callsite]['module'].unique()[0] - from_module = self.name_module_map[from_callsite] - - # Store the previous module to check the hierarchy later. - prev_module = from_module - - # Create the entry function and other functions dict. - if from_module not in self.entry_funcs: - self.entry_funcs[from_module] = [] - if from_module not in self.other_funcs: - self.other_funcs[from_module] = [] - - # Push into entry function dict since it is the first callsite. - self.entry_funcs[from_module].append(from_callsite) - - # Append to the group path. - group_path.append(from_module + "=" + from_callsite) - - elif idx == len(path) - 1: - # Final callsite in the path. - to_callsite = callsite - if "/" in to_callsite: - to_callsite = to_callsite.split("/")[-1] - # to_module = self.entire_df.loc[self.entire_df['name'] == to_callsite]['module'].unique()[0] - to_module = self.name_module_map[to_callsite] - - if prev_module != to_module: - group_path.append(to_module + "=" + to_callsite) - - if to_module not in self.entry_funcs: - self.entry_funcs[to_module] = [] - if to_module not in self.other_funcs: - self.other_funcs[to_module] = [] - - if to_callsite not in self.other_funcs[to_module]: - self.other_funcs[to_module].append(to_callsite) - - if to_callsite not in self.entry_funcs[to_module]: - self.entry_funcs[to_module].append(to_callsite) - else: - # Assign the from and to callsite. 
- from_callsite = path[idx - 1] - to_callsite = callsite - - # Get their modules. - # from_module = self.entire_df.loc[self.entire_df['name'] == from_callsite]['module'].unique()[0] - # to_module = self.entire_df.loc[self.entire_df['name'] == to_callsite]['module'].unique()[0] - - from_module = self.name_module_map[from_callsite] - to_module = self.name_module_map[to_callsite] - - # Create the entry function and other function dict if not already present. - if to_module not in self.entry_funcs: - self.entry_funcs[to_module] = [] - if to_module not in self.other_funcs: - self.other_funcs[to_module] = [] - - # if previous module is not same as the current module. - if to_module != prev_module: - # TODO: Come back and check if it is in the path. - if to_module in group_path: - prev_module = to_module - else: - group_path.append(to_module + "=" + to_callsite) - prev_module = to_module - if to_callsite not in self.entry_funcs[to_module]: - self.entry_funcs[to_module].append(to_callsite) - - elif to_module == prev_module: - to_callsite = callsite - # to_module = self.entire_df.loc[self.entire_df['name'] == to_callsite]['module'].unique()[0] - to_module = self.name_module_map[to_callsite] - - prev_module = to_module - - if to_callsite not in self.other_funcs[to_module]: - self.other_funcs[to_module].append(to_callsite) - - return group_path - - def create_component_path(self, path, group_path): - component_path = [] - component_module = group_path[len(group_path) - 1].split("=")[0] - - for idx, node in enumerate(path): - node_func = node - if "/" in node: - node = node.split("/")[-1] - module = self.name_module_map[node] - if component_module == module: - component_path.append(node_func) - - component_path.insert(0, component_module) - return tuple(component_path) - - def find_all_paths(self, df): - ret = [] - unique_paths = df["path"].unique() - for idx, path in enumerate(unique_paths): - ret.append(df.loc[df["path"] == path]) - return ret - - def update_df(self, col_name, mapping): - self.filter_df[col_name] = self.filter_df["name"].apply( - lambda node: mapping[node] if node in mapping.keys() else "" - ) - - def run(self): - group_path = {} - component_path = {} - component_level = {} - entry_func = {} - show_node = {} - node_name = {} - module = {} - change_name = {} - module_idx = {} - source_nid = {} - - module_id_map = {} - module_count = 0 - - LOGGER.debug( - f"Nodes: {len(self.filter_g.nodes())}, Edges: {len(self.filter_g.edges())}" - ) - - for idx, edge in enumerate(self.filter_g.edges()): - snode = edge[0] - tnode = edge[1] - - if "/" in snode: - snode = snode.split("/")[-1] - if "/" in tnode: - tnode = tnode.split("/")[-1] - - spath = self.name_path_map[snode] - tpath = self.name_path_map[tnode] - - stage1 = time.perf_counter() - temp_group_path_results = self.create_group_path_time(spath) - group_path[snode] = temp_group_path_results - stage2 = time.perf_counter() - # print(f"Group path: {stage2 - stage1}") - - stage3 = time.perf_counter() - component_path[snode] = self.create_component_path(spath, group_path[snode]) - component_level[snode] = len(component_path[snode]) - stage4 = time.perf_counter() - # print(f"Component path: {stage3 - stage2}") - - temp_group_path_results = self.create_group_path_time(tpath) - group_path[tnode] = temp_group_path_results - - component_path[tnode] = self.create_component_path(tpath, group_path[tnode]) - component_level[tnode] = len(component_path[tnode]) - - # if module[snode] not in module_id_map: - # module_count += 1 - # 
module_id_map[module[snode]] = module_count - # module_idx[snode] = module_id_map[module[snode]] - # else: - # module_idx[snode] = module_id_map[module[snode]] - - if component_level[snode] == 2: - entry_func[snode] = True - show_node[snode] = True - else: - entry_func[snode] = False - show_node[snode] = False - - node_name[snode] = self.name_module_map[snode] + "=" + snode - - # if module[tnode] not in module_id_map: - # module_count += 1 - # module_id_map[module[tnode]] = module_count - # module_idx[tnode] = module_id_map[module[tnode]] - # else: - # module_idx[tnode] = module_id_map[module[tnode]] - - if component_level[tnode] == 2: - entry_func[tnode] = True - show_node[tnode] = True - else: - entry_func[tnode] = False - show_node[tnode] = False - - node_name[tnode] = self.name_module_map[snode] + "=" + tnode - - # print('Node: ', snode) - # print("entry function:", entry_func[snode]) - # print("node path: ", spath) - # print("group path: ", group_path[snode]) - # print("component path: ", component_path[snode]) - # print("component level: ", component_level[snode]) - # print("Show node: ", show_node[snode]) - # print("name: ", node_name[snode]) - # print('Module: ', module[snode]) - # print("=================================") - # print('Node: ', tnode) - # print("entry function:", entry_func[tnode]) - # print("node path: ", tpath) - # print("group path: ", group_path[tnode]) - # print("component path: ", component_path[tnode]) - # print("component level: ", component_level[tnode]) - # print("Show node: ", show_node[tnode]) - # print("name: ", node_name[tnode]) - # print('Module: ', module[tnode]) - # print('#################################') - - self.update_df("group_path", group_path) - self.update_df("component_path", component_path) - self.update_df("show_node", entry_func) - self.update_df("vis_name", node_name) - self.update_df("component_level", component_level) - self.update_df("mod_index", module_idx) - self.update_df("entry_function", entry_func) - - return {"df": self.filter_df, "g": self.filter_g} diff --git a/callflow/pipeline/index.py b/callflow/pipeline/index.py deleted file mode 100644 index c60ef471..00000000 --- a/callflow/pipeline/index.py +++ /dev/null @@ -1,363 +0,0 @@ -import json -from networkx.readwrite import json_graph -import pandas as pd -import os - -# from .create_graphframe import CreateGraphFrame - -from .group_by_module import groupBy -from .group_by_module_ensemble import ensembleGroupBy -from .filter_hatchet import FilterHatchet -from .filter_networkx import FilterNetworkX -from callflow.pipeline.convert_hatchet_to_networkx import HatchetToNetworkX -from callflow.datastructures.uniongraph import UnionGraph -from callflow.algorithms.deltacon_similarity import DeltaConSimilarity -from callflow.modules.auxiliary_ensemble import EnsembleAuxiliary -from .process import PreProcess -from .state import State -from callflow import GraphFrame - -# from callflow.logger import Log -import callflow - -LOGGER = callflow.get_logger(__name__) - - -class Pipeline: - def __init__(self, config): - # self.log = Log("pipeline") - self.config = config - self.dirname = self.config.save_path - self.debug = True - - ##################### Pipeline Functions ########################### - # All pipeline functions avoid the state being mutated by reference to create separate instances of State variables. - - # Create the State from the hatchet's graphframe. 
- def create_gf(self, name): - - state = State(name) - state.new_entire_gf = GraphFrame.from_config(self.config, name) - - print(state.new_entire_gf) - print(type(state.new_entire_gf)) - # state.entire_gf = state.new_entire_gf - # state.entire_df = state.new_entire_gf.df - # state.entire_graph = state.new_entire_gf.graph - - """ - create = CreateGraphFrame(self.config, name) - #state.entire_gf = create.gf - #state.entire_df = create.df - #state.entire_graph = create.graph - """ - - LOGGER.info( - f"Number of call sites in CCT (From dataframe): {len(state.new_entire_gf.df['name'].unique())}" - ) - - return state - - # Pre-process the dataframe and Graph to add attributes to the networkX graph. - # PreProcess class is a builder. Additional attributes can be added by chained calls. - def process_gf(self, state, gf_type): - if self.config.format[state.name] == "hpctoolkit": - preprocess = ( - PreProcess.Builder(state, gf_type) - .add_path() - .create_name_module_map() - .add_callers_and_callees() - .add_dataset_name() - .add_imbalance_perc() - .add_module_name_hpctoolkit() - .add_vis_node_name() - .build() - ) - elif self.config.format[state.name] == "caliper_json": - preprocess = ( - PreProcess.Builder(state, gf_type) - .add_time_columns() - .add_rank_column() - .add_callers_and_callees() - .add_dataset_name() - .add_imbalance_perc() - .add_module_name_caliper(self.config.callsite_module_map) - .create_name_module_map() - .add_vis_node_name() - .add_path() - .build() - ) - - print(preprocess.gf) - state.new_gf = preprocess.gf - # state.df = preprocess.new_gf.df - # state.graph = preprocess.new_gf.graph - self.entire_df = state.new_gf.df - return state - - # Converts a hatchet graph to networkX graph. - def hatchetToNetworkX(self, state, path): - convert = HatchetToNetworkX(state, path, construct_graph=True, add_data=False) - - # state.g = convert.g - state.new_entire_gf.nxg = convert.nxg - state.new_gf.nxg = convert.nxg - - return state - - # Uses the hatchet's filter method. - # Filter by hatchet graphframe. - def filterHatchet(self, state, filterBy, filterPerc): - filter_obj = Filter(state, filterBy, filterPerc) - - state.new_gf = filter_obj.gf - state.new_gf.df = filter_obj.df - state.new_gf.graph = filter_obj.graph - - return state - - # Union of all the networkX graphs. - def union(self, states): - u_graph = UnionGraph() - u_df = pd.DataFrame() - for idx, dataset in enumerate(states): - u_graph.unionize(states[dataset].new_gf.nxg, dataset) - u_df = pd.concat([u_df, states[dataset].new_gf.df], sort=True) - - state = State("union") - state.new_gf = GraphFrame() - state.new_gf.df = u_df - state.new_gf.nxg = u_graph.R - - # state.df = state.new_gf.df - # state.g = state.new_gf.nxg - - """ - #state.df = u_df - #state.g = u_graph.R - """ - - if True: # self.debug: - LOGGER.debug("Done with Union.") - LOGGER.debug( - f"Number of callsites in dataframe: {len(state.new_gf.df['name'].unique())}" - ) - LOGGER.debug( - f"Number of callsites in the graph: {len(state.new_gf.nxg.nodes())}" - ) - LOGGER.debug( - f"Number of modules in the graph: {len(state.new_gf.df['module'].unique())}" - ) - - return state - - # Filter the networkX graph based on the attribute specified in the config file. 
- def filterNetworkX(self, state, perc): - filter_obj = FilterNetworkX(state) - if self.config.filter_by == "time (inc)": - df = filter_obj.filter_df_by_time_inc(perc) - g = filter_obj.filter_graph_by_time_inc(df, state.new_gf.nxg) - elif self.config.filter_by == "time": - df = filter_obj.filter_df_by_time(perc) - g = filter_obj.filter_graph_by_time(df, state.new_gf.nxg) - - state = State("filter_union") - state.new_gf = GraphFrame() - state.new_gf.df = df - state.new_gf.nxg = g - - # state.df = state.new_gf.df - # state.g = state.new_gf.nxg - - """ - #state.df = df - #state.g = g - """ - - if True: # self.debug: - LOGGER.debug("Done with Filtering the Union graph.") - LOGGER.debug( - f"Number of callsites in dataframe: {len(state.new_gf.df['name'].unique())}" - ) - LOGGER.debug( - f"Number of callsites in the graph: {len(state.new_gf.nxg.nodes())}" - ) - LOGGER.debug( - f"Number of modules in the graph: {len(state.new_gf.df['module'].unique())}" - ) - - return state - - def group(self, state, attr): - print(state.new_gf.nxg) - grouped_graph = groupBy(state, attr) - - # state.new_gf = groupBy(state, attr) - - state.new_gf.nxg = grouped_graph.g - state.new_gf.df = grouped_graph.df - return state - - def ensemble_group(self, state, attr): - grouped_graph = ensembleGroupBy( - state["ensemble_entire"], state["ensemble_filter"], attr - ).run() - - state = State("ensemble_union") - state.new_gf = GraphFrame() - state.new_gf.df = grouped_graph["df"] - state.new_gf.nxg = grouped_graph["g"] - # state.g = state.new_gf.nxg - # state.df = state.new_gf.df - - """ - #state.g = grouped_graph["g"] - #state.df = grouped_graph["df"] - """ - - if True: # self.debug: - LOGGER.debug( - f"Number of callsites in dataframe: {len(state.new_gf.df['name'].unique())}" - ) - LOGGER.debug( - f"Number of callsites in the graph: {len(state.new_gf.nxg.nodes())}" - ) - LOGGER.debug(f"Modules in the graph: {state.new_gf.df['module'].unique()}") - - return state - - ##################### Write Functions ########################### - # Write the dataset's graphframe to the file. - def write_dataset_gf(self, state, state_name, format_of_df, write_graph=True): - # dump the filtered dataframe to csv. - - df_filepath = os.path.join(self.dirname, state_name, format_of_df + "_df.csv") - graph_filepath = os.path.join( - self.dirname, state_name, format_of_df + "_graph.json" - ) - - state.new_gf.df.to_csv(df_filepath) - - g_data = json_graph.node_link_data(state.new_gf.nxg) - with open(graph_filepath, "w") as graphFile: - json.dump(g_data, graphFile) - - # Write the ensemble State to the file. - def write_ensemble_gf(self, states, state_name): - state = states[state_name] - - # dump the filtered dataframe to csv. - df_filepath = os.path.join(self.dirname, state_name + "_df.csv") - graph_filepath = os.path.join(self.dirname, state_name + "_graph.json") - - state.new_gf.df.to_csv(df_filepath) - - g_data = json_graph.node_link_data(state.new_gf.nxg) - with open(graph_filepath, "w") as graphFile: - json.dump(g_data, graphFile) - - # Write the hatchet graph to a text file. - def write_hatchet_graph(self, states, state_name): - state = states[state_name] - gf = state.new_gf - - graph_filepath = os.path.join(self.dirname, state_name, "hatchet_graph.txt") - with open(graph_filepath, "a") as hatchet_graphFile: - hatchet_graphFile.write(gf.tree(color=False)) - - # TODO: why are the filenames hardcoded? 
- graph_filepath = os.path.join( - self.dirname, state_name, "hatchet_graph_10_percent.txt" - ) - with open(graph_filepath, "a") as hatchet_graphFile: - hatchet_graphFile.write(gf.tree(color=False, threshold=0.10)) - - ##################### Read Functions ########################### - # Read the ensemble graph and dataframe. - def read_ensemble_gf(self, name): - LOGGER.info(f"[Process] Reading the union dataframe and graph : {name}") - state = State(name) - dirname = self.config.save_path - - union_df_filepath = os.path.join(dirname, name + "_df.csv") - union_graph_filepath = os.path.join(dirname, name + "_graph.json") - - with open(union_graph_filepath, "r") as union_graphFile: - union_graph = json.load(union_graphFile) - - state.new_gf = GraphFrame() - state.new_gf.nxg = json_graph.node_link_graph(union_graph) - state.new_gf.df = pd.read_csv(union_df_filepath) - - # state.g = state.new_gf.nxg - # state.df = state.new_gf.df - - """ - #state.g = json_graph.node_link_graph(union_graph) - #state.df = pd.read_csv(union_df_filepath) - """ - - return state - - # Read a single dataset, pass the dataset name as a parameter. - def read_dataset_gf(self, name): - state = State(name) - LOGGER.info( - "[Process] Reading the dataframe and graph of state: {0}".format(name) - ) - dataset_dirname = os.path.abspath(os.path.join(__file__, "../../..")) + "/data" - - df_filepath = os.path.join(self.dirname, name, "entire_df.csv") - entire_df_filepath = os.path.join(self.dirname, name, "entire_df.csv") - graph_filepath = os.path.join(self.dirname, name, "entire_graph.json") - entire_graph_filepath = os.path.join(self.dirname, name, "entire_graph.json") - - parameters_filepath = os.path.join( - dataset_dirname, self.config.runName, name, "env_params.txt" - ) - - state.new_gf = GraphFrame() - state.new_gf.df = pd.read_csv(df_filepath) - # state.df = state.new_gf.df - - # state.df = pd.read_csv(df_filepath) - with open(graph_filepath, "r") as filter_graphFile: - graph = json.load(filter_graphFile) - - state.new_gf.nxg = json_graph.node_link_graph(graph) - # state.g = state.new_gf.nxg - # state.g = json_graph.node_link_graph(graph) - - if self.config.runName.split("_")[0] == "osu_bcast": - state.projection_data = {} - for line in open(parameters_filepath, "r"): - s = 0 - for num in line.strip().split(","): - split_num = num.split("=") - state.projection_data[split_num[0]] = split_num[1] - - return state - - # Write the graph similarities to a file. 
- def deltaconSimilarity(self, datasets, states, type): - ret = {} - for idx, dataset in enumerate(datasets): - ret[dataset] = [] - for idx_2, dataset2 in enumerate(datasets): - union_similarity = Similarity(states[dataset2].g, states[dataset].g) - ret[dataset].append(union_similarity.result) - - dirname = self.config.callflow_dir - name = self.config.runName - # similarity_filepath = dirname + "/" + "similarity.json" - similarity_filepath = os.path.join(dirname, "similarity.json") - with open(similarity_filepath, "w") as json_file: - json.dump(ret, json_file) - - def read_all_data(self): - # dirname = self.config.callflow_path - all_data_filepath = os.path.join(self.config.save_path, "all_data.json") - LOGGER.info(f"[Read] {all_data_filepath}") - with open(all_data_filepath, "r") as filter_graphFile: - data = json.load(filter_graphFile) - return data diff --git a/callflow/pipeline/state.py b/callflow/pipeline/state.py deleted file mode 100644 index 11179435..00000000 --- a/callflow/pipeline/state.py +++ /dev/null @@ -1,82 +0,0 @@ -import os - -# from hatchet import * - -from callflow import GraphFrame - - -class State(object): - - # TODO: Assign self.g, self.root... - def __init__(self, dataset_name): - - # it appears we're using name as "union", "filter", etc. - # this is not a data set name! - self.name = dataset_name - - # instead of the old variables, we will use these new ones. - # these are callflow.graphframe object (has gf, df, and networkx) - self.new_gf = None - self.new_entire_gf = None - - # these are the old variables - # self.entire_g = None - # self.entire_df = None - # self.entire_graph = None - # self.g = None - # self.df = None - # self.gf = None - # self.graph = None - - # I cant see where these are used.. - # self.roots = None - # self.map = None - # self.node_hash_map = {} - self.projection_data = {} - - """ - def lookup_by_column(self, _hash, col_name): - # dont think this is used anywhere - assert False - - ret = [] - node_df = self.df.loc[self.df["node"] == self.map[str(_hash)]] - node_df_T = node_df.T.squeeze() - node_df_T_attr = node_df_T.loc[col_name] - if node_df_T_attr is not None: - if type(node_df_T_attr) is str or type(node_df_T_attr) is float: - ret.append(node_df_T_attr) - else: - ret = node_df_T_attr.tolist() - return ret - """ - - def lookup(self, node): - return self.new_gf.lookup(node) - # return self.df.loc[ - # (self.df["name"] == node.callpath[-1]) & (self.df["nid"] == node.nid) - # ] - - def lookup_with_node(self, node): - return self.new_gf.lookup_with_node(node) - # return self.df.loc[self.df["name"] == node.callpath[-1]] - - def lookup_with_name(self, name): - return self.new_gf.lookup_with_name(node) - # return self.df.loc[self.df["name"] == name] - - def lookup_with_vis_nodeName(self, name): - return self.new_gf.lookup_with_name(node) - # return self.df.loc[self.df["vis_node_name"] == name] - - def update_df(self, col_name, mapping): - return self.new_gf.update_df(col_name, mapping) - """ - self.df[col_name] = self.df["name"].apply( - lambda node: mapping[node] if node in mapping.keys() else "" - ) - """ - - def grouped_df(self, attr): - self.gdf[attr] = self.new_gf.df.groupby(attr, as_index=True, squeeze=True) - self.gdfKeys = self.gdf[attr].groups.keys() diff --git a/callflow/pipeline/to_delete_create_graphframe.py b/callflow/pipeline/to_delete_create_graphframe.py deleted file mode 100644 index 294f707a..00000000 --- a/callflow/pipeline/to_delete_create_graphframe.py +++ /dev/null @@ -1,67 +0,0 @@ 
-############################################################################## -# Copyright (c) 2018-2019, Lawrence Livermore National Security, LLC. -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of Callflow. -# Created by Suraj Kesavan . -# LLNL-CODE-741008. All rights reserved. -# -# For details, see: https://github.com/LLNL/Callflow -# Please also read the LICENSE file for the MIT License notice. -############################################################################## - -## TODO: this file is not needed anymore. -# its functionality is not in graphframe.from_config() - -# the functionality has been moved to datastructures/graphframe.py -print("WARNING: ({}) is deprecated and should be deleted!".format(__file__)) - -import pandas as pd -import time -from callflow.utils.logger import Log - -import os -import hatchet as ht - - -class CreateGraphFrame: - """ - Creates a graph frame. - Input : config variable, and run name - Output : State object containing components of graphframe as separate object variables. - """ - - def __init__(self, config, name): - self.log = Log("create_graphframe") - LOGGER.info(f"Creating graphframes: {name}") - self.config = config - self.callflow_path = config.callflow_path - self.name = name - self.run() - - def run(self): - data_path = os.path.abspath( - os.path.join(self.callflow_path, self.config.paths[self.name]) - ) - LOGGER.info(f"Data path: {data_path}") - - if self.config.format[self.name] == "hpctoolkit": - self.gf = ht.GraphFrame.from_hpctoolkit(data_path) - - elif self.config.format[self.name] == "caliper": - self.gf = ht.GraphFrame.from_caliper(data_path) - - elif self.config.format[self.name] == "caliper_json": - self.gf = ht.GraphFrame.from_caliper(data_path, query="") - - elif self.config.format[self.name] == "gprof": - self.gf = ht.GraphFrame.from_grof_dot(data_path) - - elif self.config.format[self.name] == "literal": - self.gf = ht.GraphFrame.from_literal(data_path) - - elif self.config.format[self.name] == "lists": - self.gf = ht.GraphFrame.from_lists(data_path) - - self.df = self.gf.dataframe - self.graph = self.gf.graph diff --git a/callflow/pipeline/unused_gradients.py b/callflow/pipeline/unused_gradients.py deleted file mode 100644 index c0994ce8..00000000 --- a/callflow/pipeline/unused_gradients.py +++ /dev/null @@ -1,223 +0,0 @@ -import numpy as np -from scipy import stats -import statsmodels.nonparametric.api as smnp -import matplotlib.pyplot as plt -import math - -print("WARNING: ({}) is unused in the code and should be deleted!".format(__file__)) - - -class Gradients: - def __init__(self, dfs, binCount="20"): - self.dfs = dfs - self.binCount = binCount - - # Find the rank information. - self.num_of_ranks = {} - max_ranks = 0 - for dataset in self.dfs: - self.num_of_ranks[dataset] = len(self.dfs[dataset]["rank"].unique()) - max_ranks = max(max_ranks, self.num_of_ranks[dataset]) - self.max_ranks = max_ranks - - def iqr(self, arr): - """Calculate the IQR for an array of numbers.""" - a = np.asarray(arr) - self.q1 = stats.scoreatpercentile(a, 25) - self.q2 = stats.scoreatpercentile(a, 50) - self.q3 = stats.scoreatpercentile(a, 75) - - def freedman_diaconis_bins(self, arr): - """Calculate number of hist bins using Freedman-Diaconis rule.""" - # From https://stats.stackexchange.com/questions/798/ - a = np.asarray(arr) - if len(arr) < 2: - return 1 - # Calculate the iqr ranges. 
- self.iqr(arr) - # Calculate the h - h = 2 * (self.q3 - self.q1) / (len(arr) ** (1 / 3)) - # fall back to sqrt(a) bins if iqr is 0 - if h == 0: - return int(np.sqrt(arr.size)) - else: - return int(np.ceil((arr.max() - arr.min()) / h)) - - def convert_dictmean_to_list(self, dictionary): - mean = [] - dataset = {} - for state in dictionary: - d = list(dictionary[state].values()) - # ret.append(max(d)) - mean.append(np.mean(np.array(d))) - dataset[state] = np.mean(np.array(d)) - return [mean, dataset] - - def kde( - self, - data, - gridsize=10, - fft=True, - kernel="gau", - bw="scott", - cut=3, - clip=(-np.inf, np.inf), - ): - if bw == "scott": - bw = stats.gaussian_kde(data).scotts_factor() * data.std(ddof=1) - # print("biwidth is: ", bw) - - kde = smnp.KDEUnivariate(data) - - # create the grid to fit the estimation. - support_min = min(max(data.min() - bw * cut, clip[0]), 0) - support_max = min(data.max() + bw * cut, clip[1]) - # print(support_max, support_min) - x = np.linspace(support_min, support_max, gridsize) - - kde.fit("gau", bw, fft, gridsize=gridsize, cut=cut, clip=clip) - y = kde.density - # print("Y is: ", y.shape) - - return x, y - - def histogram( - self, data, dataset_dict={}, data_min=np.nan, data_max=np.nan, - ): - if np.isnan(data_min) or np.isnan(data_max): - data_min = data.min() - data_max = data.max() - - h, b = np.histogram(data, range=[data_min, data_max], bins=int(self.binCount)) - - # Map the datasets to their histogram indexes. - dataset_position_dict = {} - for dataset in dataset_dict: - mean = dataset_dict[dataset] - for idx, x in np.ndenumerate(b): - if x > float(mean): - dataset_position_dict[dataset] = idx[0] - 1 - break - if idx[0] == len(b) - 1: - dataset_position_dict[dataset] = len(b) - 2 - - return 0.5 * (b[1:] + b[:-1]), h, dataset_position_dict - - def clean_dict(self, in_dict): - ret = {k: in_dict[k] for k in in_dict if not math.isnan(in_dict[k])} - return np.array(tuple(ret)) - - def packByRankDistribution(self, df, metric): - ret = {} - if df.empty: - ret = dict((rank, 0) for rank in range(0, self.max_ranks)) - else: - ranks = df["rank"].tolist() - metric_vals = df[metric].tolist() - # metric_vals = df.groupby("rank").max()[metric].tolist() - ret = dict(zip(ranks, metric_vals)) - return ret - - def get_runtime_data(self, df, column_name, debug=False): - time_df = df[column_name] - time_list = time_df.tolist() - - if len(time_list) == 0: - time_list = [0] * self.max_ranks - - ret = self.packByRankDistribution(df, column_name) - return ret - - def run(self, columnName="name", callsiteOrModule="", targetDataset=""): - dist_inc = {} - dist_exc = {} - mean_inc_dist = {} - max_inc_dist = {} - mean_exc_dist = {} - max_exc_dist = {} - mean_time_inc_map = {} - num_of_bins = {} - kde_grid = {} - hist_inc_grid = {} - hist_exc_grid = {} - - # Get the runtimes for all the runs. - for idx, dataset in enumerate(self.dfs): - node_df = self.dfs[dataset].loc[ - (self.dfs[dataset][columnName] == callsiteOrModule) - ] - debug = False - dist_inc[dataset] = self.get_runtime_data(node_df, "time (inc)", debug) - dist_exc[dataset] = self.get_runtime_data(node_df, "time", debug) - - # convert the dictionary of values to list of values. - temp_inc = self.convert_dictmean_to_list(dist_inc) - dist_inc_list = temp_inc[0] - dataset_inc_list = temp_inc[1] - - temp_exc = self.convert_dictmean_to_list(dist_exc) - dist_exc_list = temp_exc[0] - dataset_exc_list = temp_exc[1] - - # Calculate appropriate number of bins automatically. 
- num_of_bins = self.binCount - - hist_inc_grid = self.histogram(np.array(dist_inc_list), dataset_inc_list) - hist_exc_grid = self.histogram(np.array(dist_exc_list), dataset_exc_list) - - # max_num_of_bins = min(self.freedman_diaconis_bins(np.array(dist_list)), 50) - - # Calculate the KDE grid (x, y) - # kde_grid[vis_node_name] = self.kde(np.array(dist_list), 10) - # kde_x_min = np.min(kde_grid[vis_node_name][0]) - # kde_x_max = np.max(kde_grid[vis_node_name][0]) - # kde_y_min = np.min(kde_grid[vis_node_name][1]) - # kde_y_max = np.max(kde_grid[vis_node_name][1]) - - # print("hist ranges = {} {} {} {}\n" - # .format(hist_x_min, hist_x_max, hist_y_min, hist_y_max)) - - results = { - "Inclusive": { - "bins": num_of_bins, - "dataset": {"mean": dataset_inc_list, "position": hist_inc_grid[2]}, - # "kde": { - # "x": kde_grid[vis_node_name][0].tolist(), - # "y": kde_grid[vis_node_name][1].tolist(), - # "x_min": kde_x_min, - # "x_max": kde_x_max, - # "y_min": kde_y_min, - # "y_max": kde_y_max, - # }, - "hist": { - "x": hist_inc_grid[0].tolist(), - "y": hist_inc_grid[1].tolist(), - "x_min": hist_inc_grid[0][0], - "x_max": hist_inc_grid[0][-1], - "y_min": np.min(hist_inc_grid[1]).astype(np.float64), - "y_max": np.max(hist_inc_grid[1]).astype(np.float64), - }, - }, - "Exclusive": { - "bins": num_of_bins, - "dataset": {"mean": dataset_exc_list, "position": hist_exc_grid[2]}, - # "kde": { - # "x": kde_grid[vis_node_name][0].tolist(), - # "y": kde_grid[vis_node_name][1].tolist(), - # "x_min": kde_x_min, - # "x_max": kde_x_max, - # "y_min": kde_y_min, - # "y_max": kde_y_max, - # }, - "hist": { - "x": hist_exc_grid[0].tolist(), - "y": hist_exc_grid[1].tolist(), - "x_min": hist_exc_grid[0][0], - "x_max": hist_exc_grid[0][-1], - "y_min": np.min(hist_exc_grid[1]).astype(np.float64), - "y_max": np.max(hist_exc_grid[1]).astype(np.float64), - }, - }, - } - - return results diff --git a/callflow/pipeline/unused_trees_to_literal.py b/callflow/pipeline/unused_trees_to_literal.py deleted file mode 100644 index 500b9ffe..00000000 --- a/callflow/pipeline/unused_trees_to_literal.py +++ /dev/null @@ -1,116 +0,0 @@ -############################################################################## -# Copyright (c) 2017-2019, Lawrence Livermore National Security, LLC. -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of Hatchet. -# Created by Abhinav Bhatele . -# LLNL-CODE-741008. All rights reserved. -# -# For details, see: https://github.com/LLNL/hatchet -# Please also read the LICENSE file for the MIT License notice. 
-############################################################################## - -import numpy as np -import json - -print("WARNING: ({}) is unused in the code and should be deleted!".format(__file__)) - - -def trees_to_literal(graph, dataframe): - """ Calls to_json in turn for each tree in the graph/forest - """ - print("DFS on the graph") - # print("============================================") - # dfs(graph, dataframe, 100) - print("Number of nodes in graph", len(graph)) - print("============================================") - print("Dataframe Information") - print("Size:", dataframe.shape) - nodes = dataframe.groupby(["name", "nid"]).groups.keys() - print("Number of nodes in dataframe: ", len(nodes)) - # print("Nodes: {0}".format(nodes)) - literal = [] - nodes = dataframe["name"].unique() - adj_idx_map = {} - for idx, node in enumerate(nodes): - adj_idx_map[node] = idx - - num_of_nodes = len(nodes) - adj_matrix = np.zeros(shape=(num_of_nodes, num_of_nodes)) - - mapper = {} - - def add_nodes_and_children(hnode): - node_df = dataframe.loc[ - (dataframe["name"] == hnode.callpath[-1]) & (dataframe["nid"] == hnode.nid) - ] - node_id = node_df["nid"].unique()[0] - node_name = hnode.callpath[-1] - children = [] - - for child in hnode.children: - # print(child, child.nid) - child_df = dataframe.loc[ - (dataframe["name"] == child.callpath[-1]) - & (dataframe["nid"] == child.nid) - ] - - if not child_df.empty: - child_name = child_df["name"].unique()[0] - # print(child_name) - if child_name in adj_idx_map and node_name in adj_idx_map: - source_idx = adj_idx_map[node_name] - target_idx = adj_idx_map[child_name] - if adj_matrix[source_idx][target_idx] == 0.0: - adj_matrix[source_idx, target_idx] = 1.0 - children.append(add_nodes_and_children(child)) - - return { - "name": node_name, - "children": children, - "nid": int(node_id), - "metrics": { - "time (inc)": node_df["time (inc)"].mean(), - "time": node_df["time"].mean(), - }, - } - - for root in graph.roots: - literal.append(add_nodes_and_children(root)) - - return literal - - -def dfs(graph, dataframe, limit): - def dfs_recurse(root, level): - for node in root.children: - result = "" - if level < limit: - for i in range(0, level): - result += "- " - node_df = dataframe.loc[ - (dataframe["nid"] == node.nid) - & (dataframe["name"] == node.callpath[-1]) - ] - inclusive_runtime = " time (inc) = " + str(node_df["time (inc)"].mean()) - exclusive_runtime = " time = " + str(node_df["time"].mean()) - module = "Module = " + str(node_df["module"].unique()[0]) - result += ( - "Node = " - + node.callpath[-1] - + "[" - + module - + ":" - + str(node.nid) - + "]" - + inclusive_runtime - + exclusive_runtime - ) - print(result) - level += 1 - dfs_recurse(node, level) - - level = 0 - for root in graph.roots: - print("Root = {0} [{1}]".format(root, root.nid)) - dfs_recurse(root, level) diff --git a/callflow/server/__init__.py b/callflow/server/__init__.py new file mode 100644 index 00000000..8a452354 --- /dev/null +++ b/callflow/server/__init__.py @@ -0,0 +1 @@ +from .main import CallFlowServer diff --git a/callflow/server.py b/callflow/server/main.py similarity index 73% rename from callflow/server.py rename to callflow/server/main.py index 8ebd5575..c96f0447 100644 --- a/callflow/server.py +++ b/callflow/server/main.py @@ -1,14 +1,7 @@ -############################################################################## -# Copyright (c) 2018-2019, Lawrence Livermore National Security, LLC. -# Produced at the Lawrence Livermore National Laboratory. 
+# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# CallFlow Project Developers. See the top-level LICENSE file for details. # -# This file is part of Callflow. -# Created by Suraj Kesavan . -# LLNL-CODE-741008. All rights reserved. -# -# For details, see: https://github.com/LLNL/Callflow -# Please also read the LICENSE file for the MIT License notice. -############################################################################## +# SPDX-License-Identifier: MIT # ------------------------------------------------------------------------------ @@ -28,14 +21,14 @@ import argparse from networkx.readwrite import json_graph +# ------------------------------------------------------------------------------ +# CallFlow imports. import callflow -from callflow import SingleCallFlow, EnsembleCallFlow -from callflow.pipeline import ConfigFileReader - +from callflow import CallFlow +from callflow.operations import ConfigFileReader LOGGER = callflow.get_logger(__name__) - # ------------------------------------------------------------------------------ # Create a Flask server. app = Flask(__name__, static_url_path="/public") @@ -53,22 +46,27 @@ def __init__(self): self.debug = args.verbose or True self.production = args.production or False - configFile = args.config self.process = args.process # Read the config file using config file reader. - self.config = ConfigFileReader(configFile) + self.config = ConfigFileReader(args.config) # Call the version of callflow corresponding to number of datasets. if len(self.config.datasets) == 1: - self.callflow = SingleCallFlow(config=self.config, process=self.process) + self.callflow = callflow.CallFlow( + config=self.config, process=self.process, ensemble=False + ) else: - self.callflow = EnsembleCallFlow(config=self.config, process=self.process) + self.callflow = callflow.CallFlow( + config=self.config, process=self.process, ensemble=True + ) # Create server if not processing. if not self.process: self._create_server() + # ------------------------------------------------------------------------------ + # Private methods. @staticmethod def _create_parser(): """ @@ -106,6 +104,14 @@ def _verify_parser(args): raise Exception() def _create_server(self): + """ + Create server's request handler and starts the server. + Current version abstracts the requests into 3 categores: + General: common requests for both ensemble and single. + Single: requests for single dataset processing. + Ensemble: requests for ensemble dataset processing. + """ + # Socket request handlers self._request_handler_general() if len(self.config.datasets) == 1: @@ -125,14 +131,12 @@ def _request_handler_general(self): General socket requests. """ - # TODO: Find a better way to debug. @sockets.on("reset", namespace="/") def reset(data): """ # TODO: This might have to be deleted. """ - if self.debug: - LOGGER.debug("[Socket request] reset: {}".format(data)) + LOGGER.debug("[Socket request] reset: {}".format(data)) dataset = data["dataset"] filterBy = data["filterBy"] filterPerc = data["filterPerc"] @@ -148,17 +152,16 @@ def reset(data): @sockets.on("init", namespace="/") def init(data): """ - # TODO: Change request tag to "config". - # TODO: Remove case study. Essential data house for single callflow. :return: Config file (JSON Format). 
""" - if self.debug: - LOGGER.debug(f"[Socket request] init: {data}") - - caseStudy = data["caseStudy"] - result = json.dumps(self.config, default=lambda o: o.__dict__) - emit("init", result, json=True) + LOGGER.debug(f"[Socket request] init: {data}") + if data["mode"] == "Ensemble": + result = self.callflow.request_ensemble({"name": "init"}) + elif data["mode"] == "Single": + result = self.callflow.request_single({"name": "init"}) + json_result = json.dumps(result) + emit("init", json_result, json=True) @sockets.on("reveal_callsite", namespace="/") def reveal_callsite(data): @@ -166,8 +169,7 @@ def reveal_callsite(data): Reveal the callpaths of selected callsites. :return: networkx graph (JSON) """ - if self.debug: - LOGGER.debug(f"[Socket request] reveal_callsite: {data}") + LOGGER.debug(f"[Socket request] reveal_callsite: {data}") nxg = self.callflow.request( { "name": "supergraph", @@ -186,8 +188,7 @@ def split_by_entry_callsites(data): Reveal the entry callsite of selected module. :return: networkx graph (JSON) """ - if self.debug: - LOGGER.debug("Split by entry: {}".format(data)) + LOGGER.debug("Split by entry: {}".format(data)) nxg = self.callflow.request( { "name": "supergraph", @@ -206,8 +207,7 @@ def split_by_callees(data): Reveal the callees of selected module. :return: networkx graph (JSON) """ - if self.debug: - LOGGER.debug("Split by callees: {}".format(data)) + LOGGER.debug("Split by callees: {}".format(data)) nxg = self.callflow.request( { "name": "supergraph", @@ -220,33 +220,15 @@ def split_by_callees(data): json_result = json.dumps(result) emit("ensemble_supergraph", json_result, json=True) - # @sockets.on("mpi_range_data", namespace="/") - # def mpi_range_data(data): - # if self.debug: - # LOGGER.debug("MPI range data: {}".format(data)) - # nx_graph = self.callflow.request( - # { - # "name": "mpi_range_data", - # "datasets": data["datasets"], - # "range_from": data["range_from"], - # "range_to": data["range_to"], - # } - # ) - def _request_handler_single(self): @sockets.on("single_callsite_data", namespace="/") def single_callsite_data(data): """ - TODO: Not sure if we can merge this with init. - TODO: Needs discussion and a better naming convention. - Data house for single callflow. :return: Auxiliary data. """ - if self.debug: - LOGGER.debug("[Socket request] single_callsite_data. {}".format(data)) - - result = self.callflow.request( + LOGGER.debug("[Socket request] single_callsite_data. {}".format(data)) + result = self.callflow.request_single( { "name": "auxiliary", "dataset": data["dataset"], @@ -263,10 +245,8 @@ def single_cct(data): Single CCT. :return: CCT networkx graph (JSON format). """ - if self.debug: - LOGGER.debug("[Socket request] Single CCT: {}".format(data)) - - nxg = self.callflow.request( + LOGGER.debug("[Socket request] Single CCT: {}".format(data)) + nxg = self.callflow.request_single( { "name": "cct", "dataset": data["dataset"], @@ -274,6 +254,8 @@ def single_cct(data): } ) result = json_graph.node_link_data(nxg) + json_result = json.dumps(result) + emit("single_cct", result, json=True) @sockets.on("single_supergraph", namespace="/") @@ -282,31 +264,25 @@ def single_supergraph(data): Single SuperGraph. :return: both SuperGraph networkx graphs (JSON format). 
""" - if self.debug: - LOGGER.debug("[Socket request] single_supergraph: {}".format(data)) - + LOGGER.debug("[Socket request] single_supergraph: {}".format(data)) dataset = data["dataset"] groupBy = data["groupBy"].lower() - nxg = self.callflow.request( + nxg = self.callflow.request_single( {"name": "supergraph", "groupBy": groupBy, "dataset": dataset} ) result = json_graph.node_link_data(nxg) - # json_result = json.dumps(result) - emit("single_supergraph", result, json=True) + json_result = json.dumps(result) + emit("single_supergraph", json_result, json=True) def _request_handler_ensemble(self): @sockets.on("ensemble_callsite_data", namespace="/") def ensemble_callsite_data(data): """ - TODO: Not sure if we can merge this with init. - TODO: Needs discussion and a better naming convention. - - Essential data house for ensemble callflow. + Data house for ensemble callflow. :return: Auxiliary data. """ - if self.debug: - LOGGER.debug("[Socket request] ensemble_callsite_data: {}".format(data)) - result = self.callflow.request( + LOGGER.debug("[Socket request] ensemble_callsite_data: {}".format(data)) + result = self.callflow.request_ensemble( { "name": "auxiliary", "datasets": data["datasets"], @@ -325,9 +301,8 @@ def ensemble_cct(data): Union of all CCTs. :return: CCT networkx graph (JSON format). """ - if self.debug: - LOGGER.debug("[Socket request] ensemble_cct: {}".format(data)) - nxg = self.callflow.request( + LOGGER.debug("[Socket request] ensemble_cct: {}".format(data)) + nxg = self.callflow.request_ensemble( { "name": "ensemble_cct", "datasets": data["datasets"], @@ -335,6 +310,7 @@ def ensemble_cct(data): } ) result = json_graph.node_link_data(nxg) + # json_result = json.dumps(result) emit("ensemble_cct", result, json=True) @sockets.on("ensemble_supergraph", namespace="/") @@ -343,17 +319,15 @@ def ensemble_supergraph(data): Ensemble SuperGraph. :return: both SuperGraph networkx graphs (JSON format). """ - if self.debug: - Logger.debug("[Socket request] ensemble_supergraph: {}".format(data)) - + LOGGER.debug("[Socket request] ensemble_supergraph: {}".format(data)) datasets = data["datasets"] groupBy = data["groupBy"].lower() - nxg = self.callflow.request( + nxg = self.callflow.request_ensemble( {"name": "supergraph", "groupBy": groupBy, "datasets": datasets} ) result = json_graph.node_link_data(nxg) - # json_result = json.dumps(result) - emit("ensemble_supergraph", result, json=True) + json_result = json.dumps(result) + emit("ensemble_supergraph", json_result, json=True) @sockets.on("ensemble_similarity", namespace="/") def ensemble_similarity(data): @@ -361,9 +335,7 @@ def ensemble_similarity(data): Similarity Matrix for all callgraphs in ensemble. :return: Pair-wise similarity matrix """ - if self.debug: - LOGGER.debug("ensemble_similarity: {data}") - + LOGGER.debug("ensemble_similarity: {data}") result = self.callflow.request( { "name": "similarity", @@ -380,9 +352,8 @@ def module_hierarchy(data): Module hierarchy of the supergraph. :return: CCT networkx graph (JSON format). """ - if self.debug: - LOGGER.debug(f"module_hierarchy {data}") - nxg = self.callflow.request( + LOGGER.debug(f"module_hierarchy {data}") + nxg = self.callflow.request_ensemble( { "name": "hierarchy", "datasets": data["datasets"], @@ -400,9 +371,8 @@ def parameter_projection(data): Parameter projection of the datasets. :return: PCs. I guess. 
""" - if self.debug: - LOGGER.debug(f"parameter_projection: {data}") - result = self.callflow.request( + LOGGER.debug(f"parameter_projection: {data}") + result = self.callflow.request_ensemble( { "name": "projection", "datasets": data["datasets"], @@ -412,6 +382,7 @@ def parameter_projection(data): ) emit("parameter_projection", result, json=True) + # Not used now. But lets keep it. Will be useful. @sockets.on("parameter_information", namespace="/") def parameter_information(data): """ @@ -419,9 +390,7 @@ def parameter_information(data): Parameter information :return: { "parameter1": [Array], "parameter2": [Array] ... }. """ - if self.debug: - LOGGER.debug(f"[Socket request] parameter_information: {data}") - + LOGGER.debug(f"[Socket request] parameter_information: {data}") result = self.callflow.request( {"name": "run-information", "datasets": data["datasets"]} ) @@ -430,12 +399,10 @@ def parameter_information(data): @sockets.on("compare", namespace="/") def compare(data): """ - TODO: Verify the return type. Compare two super-graphs. :return: Gradients in some JSON format. """ - if self.debug: - LOGGER.debug("[Socket request] compare_supergraph {data}") + LOGGER.debug("[Socket request] compare_supergraph {data}") result = self.callflow.request( { "name": "compare", @@ -446,18 +413,8 @@ def compare(data): ) emit("compare", result, json=True) - def create_server(self): - app.debug = True - app.__dir__ = os.path.join(os.path.dirname(os.getcwd()), "") - # CallFlowServer routes - @app.route("/") - def root(): - print("CallFlowServer directory", app.__dir__) - return send_from_directory(app.__dir__, "index.html") - if __name__ == "__main__": - # if verbose, level = 1 # else, level = 2 callflow.init_logger(level=1) diff --git a/callflow/timer.py b/callflow/timer.py index 52729106..7f754414 100644 --- a/callflow/timer.py +++ b/callflow/timer.py @@ -16,7 +16,9 @@ class Timer(object): - """Simple phase timer with a context manager.""" + """ + Simple phase timer with a context manager. 
+ """ def __init__(self): self._phase = None diff --git a/callflow/utils.py b/callflow/utils.py index 0f55a9f6..de4c0298 100644 --- a/callflow/utils.py +++ b/callflow/utils.py @@ -16,7 +16,7 @@ def lookup_with_name(df, name): # ------------------------------------------------------------------------------ # a similar function in utils/hatchet.py -def sanitizeName(name): +def sanitize_name(name): ret_name = "" if name is None: ret_name = "Unknown" @@ -199,10 +199,10 @@ def string_to_list(string: str, sep: str): # ------------------------------------------------------------------------------ -# networx utilities +# networkx utilities # ------------------------------------------------------------------------------ # not sure if this is used anywhere -# Also, why is this not consistent with the rest of the stlye (ie, actions) +# Also, why is this not consistent with the rest of the style (ie, actions) def dfs(graph, dataframe, limit): def _dfs_recurse(root, level): for node in root.children: @@ -266,33 +266,6 @@ def graphmltojson(graphfile, outfile): # ------------------------------------------------------------------------------ - - -def getPathListFromFrames(frames): - paths = [] - for frame in frames: - path = [] - for f in frame: - if f["type"] == "function": - path.append(f["name"]) - elif f["type"] == "statement": - path.append(f["file"] + ":" + str(f["line"])) - elif f["type"] == "loop": - path.append(f["file"] + ":" + str(f["line"])) - paths.append(path) - return path - - -def framesToPathLists(paths): - all_paths = [] - for path in paths: - curr_path = [] - for frame in path: - curr_path.append(frame["name"]) - all_paths.append(curr_path) - return all_paths - - def bfs_hatchet(graph): ret = {} node_count = 0 @@ -330,7 +303,7 @@ def getNodeParents(node): return node.parents -def getNodeName(node): +def get_callsite_name_from_frame(node): name = node.frame.get("name") if name != None: return node.frame.get("name") @@ -338,18 +311,10 @@ def getNodeName(node): return node.frame.get("file") -def sanitizeName(name): - if name is None: - return "Unknown" - if "/" in name: - name_split = name.split("/") - return name_split[len(name_split) - 1] - else: - return name - - -# Return the Callsite name from frame. -def getNodeDictFromFrame(frame): +def node_dict_from_frame(frame): + """ + Constructs callsite's name from Hatchet's frame. + """ if frame["type"] == "function": return {"name": frame["name"], "line": "NA", "type": "function"} elif frame["type"] == "statement": @@ -358,3 +323,21 @@ def getNodeDictFromFrame(frame): return {"name": frame["file"], "line": frame["line"], "type": "loop"} else: return {} + + +def path_list_from_frames(frames): + """ + Constructs callsite's path from Hatchet's frame. 
+ """ + paths = [] + for frame in frames: + path = [] + for f in frame: + if f["type"] == "function": + path.append(f["name"]) + elif f["type"] == "statement": + path.append(f["file"] + ":" + str(f["line"])) + elif f["type"] == "loop": + path.append(f["file"] + ":" + str(f["line"])) + paths.append(path) + return path diff --git a/data/caliper-cali/config.callflow.json b/data/caliper-cali/config.callflow.json index c0e877bb..e2926c61 100644 --- a/data/caliper-cali/config.callflow.json +++ b/data/caliper-cali/config.callflow.json @@ -1,6 +1,7 @@ { "run_name": "caliper-cali", "save_path": "./data/caliper-cali/.callflow", + "read_parameter": false, "datasets": [ { "name": "caliper-ex", diff --git a/data/caliper-cpi-json/config.callflow.json b/data/caliper-cpi-json/config.callflow.json index 3ff1d871..8e078cb2 100644 --- a/data/caliper-cpi-json/config.callflow.json +++ b/data/caliper-cpi-json/config.callflow.json @@ -1,6 +1,7 @@ { "run_name": "caliper-cpi-json", "save_path": "./data/caliper-cpi-json/.callflow", + "read_parameter": false, "datasets": [ { "name": "caliper-ex", diff --git a/data/caliper-lulesh-json/config.callflow.json b/data/caliper-lulesh-json/config.callflow.json index 13e51ae6..e841b811 100644 --- a/data/caliper-lulesh-json/config.callflow.json +++ b/data/caliper-lulesh-json/config.callflow.json @@ -1,6 +1,7 @@ { "run_name": "caliper-lulesh-json", "save_path": "data/caliper-lulesh-json/.callflow", + "read_parameter": false, "datasets": [ { "name": "lulesh", diff --git a/data/gprof2dot-cpi/config.callflow.json b/data/gprof2dot-cpi/config.callflow.json index 6d90b23b..6bbc1faa 100644 --- a/data/gprof2dot-cpi/config.callflow.json +++ b/data/gprof2dot-cpi/config.callflow.json @@ -1,6 +1,7 @@ { "run_name": "gprof-cpi", "save_path": "./data/gprof2dot-cpi/.callflow", + "read_parameter": false, "datasets": [ { "name": "calc-pi", diff --git a/data/hpctoolkit-cpi-database/config.callflow.json b/data/hpctoolkit-cpi-database/config.callflow.json index 598275a5..5fbe4820 100644 --- a/data/hpctoolkit-cpi-database/config.callflow.json +++ b/data/hpctoolkit-cpi-database/config.callflow.json @@ -1,6 +1,7 @@ { "run_name": "hpctoolkit-cpi-database", "save_path": "./data/hpctoolkit-cpi-database/.callflow", + "read_parameter": false, "datasets": [ { "name": "calc-pi",