diff --git a/src/cbdgen-framework.py b/src/cbdgen-framework.py
index 6933ab8..44e6b50 100644
--- a/src/cbdgen-framework.py
+++ b/src/cbdgen-framework.py
@@ -1,80 +1,87 @@
-import numpy as np
-import pandas as pd
-import random
-import matplotlib.pyplot as plt
 import multiprocessing
 import pickle
-from sklearn.datasets import load_iris
-from matplotlib import pyplot
+import random
+import numpy as np
+import pandas as pd
+from deap import algorithms
 from deap import base
 from deap import creator
 from deap import tools
-from deap import algorithms
-
-import rpy2.robjects as robjects
-from meta_features.ecol import Ecol
+from rpy2 import robjects
 
-import setup.setup_framework as setup
-from instances_generator.generator import InstancesGenerator
 import extractor
 import preprocess
+import setup.setup_framework as setup
+from meta_features.ecol import Ecol
+from instances_generator.generator import InstancesGenerator
 
-# TODO: Implement Setup in a minimal main()
-options = setup.get_options()
-
-cont = 0
-bobj = 0.4
-P = [12]
-SCALES = [1]
-tread = ""
-select_new_dataset = "N"
-NGEN = 1000
-# NGEN = options['NGEN']
-CXPB = 0.7
-MUTPB = 0.2
-INDPB = 0.05
-POP = 100
-
-# TODO: Implement Generator of Instances in a minimal main()
-gen_instances = InstancesGenerator(options)
-df = gen_instances.generate(options['maker'][0])
-
-filename = options['filename'] if options['filename'] != "" else "NGEN=" + \
-    str(NGEN)
-
-metrics = options['measures']
-
-# TODO: Implement fitness global measures in a minimal main()
-global_measures = []
-if (options['filepath'] != ""):
-    base_df = pd.read_csv(options['filepath'])
-    target = options['label_name']
-
-    # Copying Columns names
-    # df.columns = preprocess.copyFeatureNamesFrom(base_df, label_name=target)
-
-    # Extraction of Data Complexity Values
-    global_measures = tuple(extractor.complexity(base_df, target, metrics))
-else:
-    for metric in metrics:
-        global_measures.append(options[metric])
-    global_measures = tuple(global_measures)
-
-filename += '-' + '-'.join(metrics)
-N_ATTRIBUTES = int(options['samples'])  # mispelled variable name
-print(metrics, len(metrics))
-print(global_measures)
-NOBJ = len(metrics)
-
-dic = {}
-
-# reference points
-ref_points = [tools.uniform_reference_points(
-    NOBJ, p, s) for p, s in zip(P, SCALES)]
-ref_points = np.concatenate(ref_points)
-_, uniques = np.unique(ref_points, axis=0, return_index=True)
-ref_points = ref_points[uniques]
+def generate_instances(samples, attributes, classes, maker: tuple[int, str]
+                       ) -> pd.DataFrame:
+    """
+    Function responsible for the generation of instances, relying on an
+    InstancesGenerator object.
+
+    Parameters
+    ----------
+    samples : Number of instances to be generated.
+    attributes : Number of attributes/features to be generated.
+    classes : Number of classes an instance can be assigned to.
+    maker : The type of maker that will generate the set of instances.
+
+    Returns
+    -------
+    pandas.DataFrame
+    """
+    gen_instances = InstancesGenerator(samples, attributes,
+                                       classes=classes,
+                                       maker_option=maker[1])
+    return gen_instances.generate(maker[0])
+
+def complexity_extraction(measures: list[str], *,
+                          dataframe_label: tuple[pd.DataFrame, str] = None,
+                          complexity_values: dict) -> tuple[np.float64, ...]:
+    """
+    Function that extracts complexity values of a dataset, relying on the
+    extractor module.
+
+    Parameters
+    ----------
+    measures : A list of complexity measures to extract from the dataset.
+    dataframe_label : The reference DataFrame and the name of its label
+        column. When given, complexity values are extracted from it.
+    complexity_values : Dictionary of user-supplied target complexity
+        values, used when no reference DataFrame is given (TODO: simplify!).
+
+    Returns
+    -------
+    tuple of the complexity values, one per requested measure
+    """
+    if dataframe_label is not None:
+        # Copying column names
+        # df.columns = preprocess.copyFeatureNamesFrom(base_df, label_name=target)
+
+        # Extraction of data complexity values
+        return tuple(extractor.complexity(dataframe_label[0],
+                                          dataframe_label[1],
+                                          measures))
+    return tuple(complexity_values[cm] for cm in measures)
+
+# TODO: Build a clever architecture for the filename
+def build_filename(filename: str = '', *, ngen: int, metrics: list) -> str:
+    """
+    Function that builds a filename based on the number of generations and
+    the metrics used to optimize.
+
+    Parameters
+    ----------
+    filename : Name or prefix of the file that will contain the result of
+        the optimization process.
+    ngen : Number of generations of the current optimization run.
+    metrics : A list of metrics used to optimize.
+
+    Returns
+    -------
+    str
+    """
+    filename = filename if filename != "" else "NGEN=" + str(ngen)
+    filename += '-' + '-'.join(metrics)
+    return filename
 
 def my_evaluate(individual):
     vetor = []
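
Note: the two call modes of complexity_extraction are easier to see in a
concrete sketch. The measure names and reference CSV below are hypothetical;
complexity_values has no default, so it is always passed, but it is ignored
whenever a reference DataFrame is supplied.

    # Mode 1 -- extract target complexity values from a reference dataset:
    targets = complexity_extraction(['F2', 'N1'],
                                    dataframe_label=(pd.read_csv('iris.csv'),
                                                     'Species'),
                                    complexity_values={})

    # Mode 2 -- take user-supplied target values directly:
    targets = complexity_extraction(['F2', 'N1'],
                                    complexity_values={'F2': 0.3, 'N1': 0.1})
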
@@ -100,25 +107,70 @@ def print_evaluate(individual):
 
     return tuple(vetor)
 
-
-creator.create("FitnessMin", base.Fitness, weights=(-1.0,)*NOBJ)
-creator.create("Individual", list, fitness=creator.FitnessMin)
-
-RANDINT_LOW = 0
-RANDINT_UP = options['classes'] - 1
-
-toolbox = base.Toolbox()
-toolbox.register("attr_int", random.randint, RANDINT_LOW, RANDINT_UP)
-toolbox.register("individual", tools.initRepeat,
-                 creator.Individual, toolbox.attr_int, N_ATTRIBUTES)
-toolbox.register("population", tools.initRepeat, list, toolbox.individual)
-toolbox.register("evaluate", my_evaluate)
-toolbox.register("mate", tools.cxTwoPoint)
-toolbox.register("mutate", tools.mutShuffleIndexes, indpb=INDPB)
-toolbox.register("select", tools.selNSGA3, ref_points=ref_points)
-
-
-def main(seed=None):
+def setup_engine(options):
+    """
+    Function that sets up a deap.base.Toolbox for the search-engine process.
+
+    Parameters
+    ----------
+    options : Dictionary of setup parameters that determine how the search
+        engine will look for solutions.
+
+    Returns
+    -------
+    deap.base.Toolbox
+    """
+    samples = int(options['samples'])
+    n_objectives = len(options['measures'])
+
+    # Reference points for NSGA-III selection
+    ref_points = [tools.uniform_reference_points(
+        n_objectives, p, s) for p, s in zip(options['P'], options['SCALES'])]
+    ref_points = np.concatenate(ref_points)
+    _, uniques = np.unique(ref_points, axis=0, return_index=True)
+    ref_points = ref_points[uniques]
+
+    creator.create("FitnessMin", base.Fitness, weights=(-1.0,) * n_objectives)
+    creator.create("Individual", list, fitness=creator.FitnessMin)
+
+    randint_low = 0
+    randint_up = options['classes'] - 1
+
+    toolbox = base.Toolbox()
+    toolbox.register("attr_int", random.randint, randint_low, randint_up)
+    toolbox.register("individual", tools.initRepeat,
+                     creator.Individual, toolbox.attr_int, samples)
+    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
+    toolbox.register("evaluate", my_evaluate)
+    toolbox.register("mate", tools.cxTwoPoint)
+    toolbox.register("mutate", tools.mutShuffleIndexes, indpb=options['INDPB'])
+    toolbox.register("select", tools.selNSGA3, ref_points=ref_points)
+
+    return toolbox
+
+def results(options: dict, toolbox: base.Toolbox):
+    """
+    Function that runs the search-engine process, driving an evolutionary
+    algorithm to find the best results.
+
+    Parameters
+    ----------
+    options : Dictionary of setup parameters that determine how the search
+        engine will look for solutions.
+    toolbox : A Toolbox for evolution that contains the evolutionary
+        operators.
+
+    Returns
+    -------
+    deap.base.Toolbox.population : A population of the best individuals
+        found by the search-engine process.
+    deap.tools.Logbook : A logbook with evolutionary and statistical
+        information about the search process.
+    """
+    pop = options['POP']
+    cxpb = options['CXPB']
+    mutpb = options['MUTPB']
+    ngen = options['NGEN']
     random.seed(64)
     pool = multiprocessing.Pool(processes=12)
     toolbox.register("map", pool.map)
@@ -132,58 +184,84 @@ def main(seed=None):
     logbook = tools.Logbook()
     logbook.header = "gen", "evals", "std", "min", "avg", "max"
 
-    pop = toolbox.population(POP)
+    tool_pop = toolbox.population(pop)
 
     # Evaluate the individuals with an invalid fitness
-    invalid_ind = [ind for ind in pop if not ind.fitness.valid]
+    invalid_ind = [ind for ind in tool_pop if not ind.fitness.valid]
     fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
     for ind, fit in zip(invalid_ind, fitnesses):
         ind.fitness.values = fit
 
     # Compile statistics about the population
-    record = stats.compile(pop)
+    record = stats.compile(tool_pop)
     logbook.record(gen=0, evals=len(invalid_ind), **record)
     print(logbook.stream)
 
     # Begin the generational process
-    for gen in range(1, NGEN):
-        offspring = algorithms.varAnd(pop, toolbox, CXPB, MUTPB)
+    for gen in range(1, ngen):
+        offspring = algorithms.varAnd(tool_pop, toolbox, cxpb, mutpb)
 
         # Evaluate the individuals with an invalid fitness
         invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
         fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
         for ind, fit in zip(invalid_ind, fitnesses):
             ind.fitness.values = fit
 
         # Select the next generation population from parents and offspring
-        pop = toolbox.select(pop + offspring, POP)
+        tool_pop = toolbox.select(tool_pop + offspring, pop)
 
         # Compile statistics about the new population
-        record = stats.compile(pop)
+        record = stats.compile(tool_pop)
         logbook.record(gen=gen, evals=len(invalid_ind), **record)
         print(logbook.stream)
 
-    return pop, logbook
+    return tool_pop, logbook
+
+def main():
+    options = setup.get_options()
+
+    # A reference dataset is optional; complexity targets may instead come
+    # from the values supplied in the options.
+    dataframe_label = None
+    if options['filepath'] != '':
+        base_df = pd.read_csv(options['filepath'])
+        dataframe_label = (base_df, options['label_name'])
+
+    global dataFrame
+    dataFrame = generate_instances(options['samples'], options['attributes'],
+                                   options['classes'], options['maker'])
+
+    complexity_values = {}
+    global metrics
+    metrics = options['measures']
+    for measure in metrics:
+        complexity_values[measure] = options[measure]
+    global global_measures
+    global_measures = complexity_extraction(metrics,
+                                            dataframe_label=dataframe_label,
+                                            complexity_values=complexity_values)
+
+    filename = build_filename(options['filename'],
+                              ngen=options['NGEN'],
+                              metrics=metrics)
 
-
-if __name__ == '__main__':
-    cont1 = 0
-    cont0 = 0
-    #dataFrame = pd.read_csv(str(N_ATTRIBUTES) + '.csv')
-    #dataFrame = dataFrame.drop('c0', axis=1)
-    dataFrame = df
     # This Ecol object should be called according to the variable dataFrame.
-    # If dataFrame is renamed, then ecol_dataFrame should be renamed
+    # If dataFrame is renamed, then ecol_dataFrame should be renamed
     # accordingly.
+    global ecol_dataFrame
    ecol_dataFrame = Ecol(dataframe=dataFrame, label='label')
-    results = main()
-    print("logbook")
-    print(results[0][0])
-    for x in range(len(results[0])):
-        dic[print_evaluate(results[0][x])] = results[0][x]
+
+    print(metrics, len(metrics))
+    print(global_measures)
+    toolbox = setup_engine(options)
+    result = results(options, toolbox)
+
+    compiled_results = {}
+    for x in range(len(result[0])):
+        compiled_results[print_evaluate(result[0][x])] = result[0][x]
     outfile = open(filename, 'wb')
-    pickle.dump(dic, outfile)
+    pickle.dump(compiled_results, outfile)
     outfile.close()
-    df['label'] = results[0][0]
+    dataFrame['label'] = result[0][0]
 
     # Scale to original Dataset (Optional) #TODO: Improve preprocessing
     # df = preprocess.scaleColumnsFrom(base_df, df, label_column='label')
-    df.to_csv(str(filename)+".csv")
-    ax1 = df.plot.scatter(x=0, y=1, c='label', colormap='Paired')
-    pyplot.show()
+    dataFrame.to_csv(str(filename) + ".csv")
+
+if __name__ == '__main__':
+    main()
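
Note: in setup_engine, the number of NSGA-III uniform reference points grows
combinatorially with the number of objectives, which is why P and SCALES are
promoted to hyperparameters below. A quick sketch against the standard DEAP
API, using the usual C(n_objectives + p - 1, p) count:

    from math import comb
    from deap import tools

    for nobj in (2, 3, 4):
        pts = tools.uniform_reference_points(nobj, p=12)
        assert len(pts) == comb(nobj + 12 - 1, 12)
        print(nobj, len(pts))  # 2 -> 13, 3 -> 91, 4 -> 455
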
diff --git a/src/instances_generator/generator.py b/src/instances_generator/generator.py
index cc209be..571aca2 100644
--- a/src/instances_generator/generator.py
+++ b/src/instances_generator/generator.py
@@ -20,7 +20,7 @@ class InstancesGenerator:
         5: '_mlabel_classf'
     }
 
-    def __init__(self, options: dict):
+    def __init__(self, samples, attributes, classes=None, maker_option=None):
         """
         Constructs the generator based on properties desired.
 
@@ -30,10 +30,10 @@ def __init__(self, options: dict):
             properties desired to generate a dataset (e.g. samples,
             attributes, classes).
         """
-        self._samples = options['samples']
-        self._attributes = options['attributes']
-        self._classes = options['classes']
-        self._optional_option = options['maker'][1]
+        self._samples = samples
+        self._attributes = attributes
+        self._classes = classes
+        self._optional_option = maker_option
 
     def generate(self, type_gen: int) -> DataFrame:
         """
diff --git a/src/setup/interactor.py b/src/setup/interactor.py
index a7e9e41..0908888 100644
--- a/src/setup/interactor.py
+++ b/src/setup/interactor.py
@@ -75,6 +75,9 @@ def measures_input() -> list:
     # Appends every Complexity Measure in measures list
     return [cm(measure) for measure in input_Stream]
 
+def generation_input() -> int:
+    return int(input("How many generations do you want to use to optimize the dataset?\n"))
+
 def __input_with_default__(input_text: str, default_value, data_type: type):
     try:
         return data_type(input(input_text))
diff --git a/src/setup/setup_framework.py b/src/setup/setup_framework.py
index 9f5cb30..802840b 100644
--- a/src/setup/setup_framework.py
+++ b/src/setup/setup_framework.py
@@ -2,11 +2,22 @@
 import setup.argparser as argparser
 import setup.interactor as interactor
 
+# TODO: Move these search-engine hyperparameters into a config.json or
+# parameters.json.
+HYPERPARAMETERS = {
+    'P': [12],
+    'SCALES': [1],
+    'CXPB': 0.7,
+    'MUTPB': 0.2,
+    'INDPB': 0.05,
+    'POP': 100
+}
+
 def get_options() -> dict:
     args = argparser.parse_args()
-    if args.option_interative:
-        return setup_interative()
-    return setup_non_interative(args)
+    return (setup_interative() if args.option_interative
+            else setup_non_interative(args)) | HYPERPARAMETERS
 
 def setup_interative() -> dict:
     options = {}
@@ -34,6 +45,7 @@ def setup_interative() -> dict:
 
     # TODO: Extract Measures from the real dataset
     options['filename'] = interactor.filename_input()
+    options['NGEN'] = interactor.generation_input()
 
     # Separating Measures
     if measures != None:
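
Note: the TODO in setup_framework.py asks for a config.json or parameters.json
to hold the search-engine hyperparameters. One possible follow-up, sketched
under assumptions (the file name, layout, and load_hyperparameters helper are
not part of this patch):

    import json

    # parameters.json would mirror the HYPERPARAMETERS dict:
    # {"P": [12], "SCALES": [1], "CXPB": 0.7,
    #  "MUTPB": 0.2, "INDPB": 0.05, "POP": 100}
    def load_hyperparameters(path: str = 'parameters.json') -> dict:
        with open(path, encoding='utf-8') as config_file:
            return json.load(config_file)

    def get_options() -> dict:
        args = argparser.parse_args()
        options = (setup_interative() if args.option_interative
                   else setup_non_interative(args))
        return options | load_hyperparameters()
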