[Refactor] Main Code to a modular Architecture (#34)
Merge pull request #34 from SteffanoP/refactor/def-main
SteffanoP authored May 10, 2022
2 parents 9da3d8e + dbea3fd commit 36604f1
Showing 4 changed files with 213 additions and 120 deletions.
302 changes: 190 additions & 112 deletions src/cbdgen-framework.py
@@ -1,80 +1,87 @@
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import multiprocessing
import pickle
from sklearn.datasets import load_iris
from matplotlib import pyplot
import random

import numpy as np
import pandas as pd
from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from deap import algorithms

import rpy2.robjects as robjects
from meta_features.ecol import Ecol
from rpy2 import robjects

import setup.setup_framework as setup
from instances_generator.generator import InstancesGenerator
import extractor
import preprocess
import setup.setup_framework as setup
from meta_features.ecol import Ecol
from instances_generator.generator import InstancesGenerator

# TODO: Implement Setup in a minimal main()
options = setup.get_options()

cont = 0
bobj = 0.4
P = [12]
SCALES = [1]
tread = ""
select_new_dataset = "N"
NGEN = 1000
# NGEN = options['NGEN']
CXPB = 0.7
MUTPB = 0.2
INDPB = 0.05
POP = 100

# TODO: Implement Generator of Instances in a minimal main()
gen_instances = InstancesGenerator(options)
df = gen_instances.generate(options['maker'][0])

filename = options['filename'] if options['filename'] != "" else "NGEN=" + \
str(NGEN)

metrics = options['measures']

# TODO: Implement fitness global measures in a minimal main()
global_measures = []
if (options['filepath'] != ""):
base_df = pd.read_csv(options['filepath'])
target = options['label_name']

# Copying Columns names
# df.columns = preprocess.copyFeatureNamesFrom(base_df, label_name=target)

# Extraction of Data Complexity Values
global_measures = tuple(extractor.complexity(base_df, target, metrics))
else:
for metric in metrics:
global_measures.append(options[metric])
global_measures = tuple(global_measures)

filename += '-' + '-'.join(metrics)
N_ATTRIBUTES = int(options['samples'])  # misnamed variable (it stores the sample count)
print(metrics, len(metrics))
print(global_measures)
NOBJ = len(metrics)

dic = {}

# reference points
ref_points = [tools.uniform_reference_points(
NOBJ, p, s) for p, s in zip(P, SCALES)]
ref_points = np.concatenate(ref_points)
_, uniques = np.unique(ref_points, axis=0, return_index=True)
ref_points = ref_points[uniques]
def generate_instances(samples, attributes, classes, maker: tuple[int, str]
) -> pd.DataFrame:
"""
Function responsible for the generation of instances, highly dependent
on an InstancesGenerator object.
Parameters
----------
samples : Number of instances to be generated.
attributes : Number of attributes/features to be generated.
classes : Number of classes an instance can be assigned to.
maker : The type of maker that will generate the set of instances.
Returns
-------
pandas.DataFrame
"""
gen_instances = InstancesGenerator(samples, attributes,
classes=classes,
maker_option=maker[1])
return gen_instances.generate(maker[0])
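
For illustration (an editor's sketch, not part of this commit): a minimal call to generate_instances, assuming maker type 1 is one of the registered generators and the empty string is an acceptable optional maker argument:

    sample_df = generate_instances(samples=100, attributes=2,
                                   classes=2, maker=(1, ''))
    print(sample_df.shape)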

def complexity_extraction(measures: list[str], *,
dataframe_label: tuple[pd.DataFrame,str]=None,
complexity_values: dict) -> tuple[np.float64]:
"""
Function that extracts complexity values of a Data Set, highly dependent
of a extractor module.
Parameters
----------
measures : A list of complexity measures to extract from the Data Set.
dataframe_label : Refers to the DataFrame itself and its label.
complexity_values : Dictionary of complexity values (TODO: Simplify!)
Returns
-------
tuple[complexity_values]
"""
if dataframe_label is not None:
# Copying Columns names
# df.columns = preprocess.copyFeatureNamesFrom(base_df, label_name=target)

# Extraction of Data Complexity Values
return tuple(extractor.complexity(dataframe_label[0],
dataframe_label[1],
measures))
return tuple(complexity_values[cm] for cm in measures)
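
A usage sketch (editor's illustration; 'F1' and 'N1' are assumed to be among the supported complexity measures, and base_df a DataFrame loaded earlier): with a base dataset the values are extracted from it, otherwise the user-supplied targets are returned:

    # Extract from a reference dataset:
    target = complexity_extraction(['F1', 'N1'],
                                   dataframe_label=(base_df, 'label'),
                                   complexity_values={})
    # Fall back to user-provided values:
    target = complexity_extraction(['F1', 'N1'],
                                   complexity_values={'F1': 0.5, 'N1': 0.3})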

# TODO: Build a clever architecture for the filename
def build_filename(filename: str='', *, ngen: int, metrics: list) -> str:
"""
Function that builds a filename based on the number of generations and
metrics used to optimize.
Parameters
----------
filename : Name or Prefix of the File that contains the result of the
optimization process.
ngen : Number of generations of the current run of optimization.
metrics : A list of metrics used to optimize.
"""
filename = filename if filename != "" else "NGEN="+ \
str(ngen)
filename += '-' + '-'.join(metrics)
return filename
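
Given the body above, an empty prefix falls back to the generation count while a non-empty one is kept (editor's example):

    assert build_filename('', ngen=1000, metrics=['F1', 'N1']) == 'NGEN=1000-F1-N1'
    assert build_filename('iris', ngen=1000, metrics=['F1', 'N1']) == 'iris-F1-N1'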

def my_evaluate(individual):
vetor = []
@@ -100,25 +107,70 @@ def print_evaluate(individual):

return tuple(vetor)


creator.create("FitnessMin", base.Fitness, weights=(-1.0,)*NOBJ)
creator.create("Individual", list, fitness=creator.FitnessMin)

RANDINT_LOW = 0
RANDINT_UP = options['classes'] - 1

toolbox = base.Toolbox()
toolbox.register("attr_int", random.randint, RANDINT_LOW, RANDINT_UP)
toolbox.register("individual", tools.initRepeat,
creator.Individual, toolbox.attr_int, N_ATTRIBUTES)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", my_evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutShuffleIndexes, indpb=INDPB)
toolbox.register("select", tools.selNSGA3, ref_points=ref_points)


def main(seed=None):
def setup_engine(options):
"""
Function that sets up a deap.base.Toolbox for the search-engine process.
Parameters
----------
options : Dictionary of setup parameters that determines how the
search engine will find the solutions.
Returns
-------
deap.base.Toolbox
"""
samples = int(options['samples'])
n_objectives = len(options['measures'])

# reference points
ref_points = [tools.uniform_reference_points(
n_objectives, p, s) for p, s in zip(options['P'], options['SCALES'])]
ref_points = np.concatenate(ref_points)
_, uniques = np.unique(ref_points, axis=0, return_index=True)
ref_points = ref_points[uniques]

creator.create("FitnessMin", base.Fitness, weights=(-1.0,)*n_objectives)
creator.create("Individual", list, fitness=creator.FitnessMin)

randint_down = 0
randint_up = options['classes'] - 1

toolbox = base.Toolbox()
toolbox.register("attr_int", random.randint, randint_down, randint_up)
toolbox.register("individual", tools.initRepeat,
creator.Individual, toolbox.attr_int, samples)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", my_evaluate)
toolbox.register("mate", tools.cxTwoPoint)
indpb = options['INDPB']
toolbox.register("mutate", tools.mutShuffleIndexes, indpb=indpb)
toolbox.register("select", tools.selNSGA3, ref_points=ref_points)

return toolbox
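
As a sanity check on the reference-point construction (an editor's sketch using DEAP's tools.uniform_reference_points): with two objectives and p = 12 divisions, NSGA-III places C(2 + 12 - 1, 12) = 13 uniformly spaced points on the simplex:

    from deap import tools

    ref = tools.uniform_reference_points(2, 12, 1)
    print(len(ref))  # 13 reference points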

def results(options: dict, toolbox: base.Toolbox):
"""
Function that runs the search-engine process, executing an
evolutionary algorithm to find the best results.
Parameters
----------
options : Dictionary of setup parameters that determines how the
search engine will find the solutions.
toolbox : A Toolbox for evolution that contains evolutionary operators.
Returns
-------
deap.base.Toolbox.population : A population of the best individuals
from the search-engine process.
deap.tools.Logbook : A logbook that contains evolutionary and
statistical information about the search process.
"""
pop = options['POP']
cxpb = options['CXPB']
mutpb = options['MUTPB']
ngen = options['NGEN']
random.seed(64)
pool = multiprocessing.Pool(processes=12)
toolbox.register("map", pool.map)
@@ -132,58 +184,84 @@ def main(seed=None):
logbook = tools.Logbook()
logbook.header = "gen", "evals", "std", "min", "avg", "max"

pop = toolbox.population(POP)
tool_pop = toolbox.population(pop)

# Evaluate the individuals with an invalid fitness
invalid_ind = [ind for ind in pop if not ind.fitness.valid]
invalid_ind = [ind for ind in tool_pop if not ind.fitness.valid]
fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
for ind, fit in zip(invalid_ind, fitnesses):
ind.fitness.values = fit
# Compile statistics about the population
record = stats.compile(pop)
record = stats.compile(tool_pop)

logbook.record(gen=0, evals=len(invalid_ind), **record)
print(logbook.stream)
# Begin the generational process
for gen in range(1, NGEN):
offspring = algorithms.varAnd(pop, toolbox, CXPB, MUTPB)
for gen in range(1, ngen):
offspring = algorithms.varAnd(tool_pop, toolbox, cxpb, mutpb)
# Evaluate the individuals with an invalid fitness
invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
for ind, fit in zip(invalid_ind, fitnesses):
ind.fitness.values = fit
# Select the next generation population from parents and offspring
pop = toolbox.select(pop + offspring, POP)
tool_pop = toolbox.select(tool_pop + offspring, pop)

# Compile statistics about the new population
record = stats.compile(pop)
record = stats.compile(tool_pop)
logbook.record(gen=gen, evals=len(invalid_ind), **record)
print(logbook.stream)
return pop, logbook
return tool_pop, logbook

def main():
options = setup.get_options()

if options['filepath'] != '':
base_df = pd.read_csv(options['filepath'])

global dataFrame
dataFrame = generate_instances(options['samples'], options['attributes'],
options['classes'], options['maker'])

complexity_values = {}
global metrics
metrics = options['measures']
for measure in metrics:
complexity_values[measure] = options[measure]
# Guard the case where no base dataset was supplied; otherwise base_df
# would be unbound when options['filepath'] is empty.
dataframe_label = ((base_df, options['label_name'])
if options['filepath'] != '' else None)
global global_measures
global_measures = complexity_extraction(metrics,
dataframe_label=dataframe_label,
complexity_values=complexity_values)

filename = build_filename(options['filename'],
ngen=options['NGEN'],
metrics=metrics)


if __name__ == '__main__':
cont1 = 0
cont0 = 0
#dataFrame = pd.read_csv(str(N_ATTRIBUTES) + '.csv')
#dataFrame = dataFrame.drop('c0', axis=1)
dataFrame = df
# This Ecol object should be called according to the variable dataFrame.
# If dataFrame is renamed, then ecol_dataFrame should be renamed
# accordingly.
global ecol_dataFrame
ecol_dataFrame = Ecol(dataframe=dataFrame, label='label')
results = main()
print("logbook")
print(results[0][0])
for x in range(len(results[0])):
dic[print_evaluate(results[0][x])] = results[0][x]

print(metrics, len(metrics))
print(global_measures)
toolbox = setup_engine(options)
result = results(options, toolbox)

compiled_results = {}
for x in range(len(result[0])):
compiled_results[print_evaluate(result[0][x])] = result[0][x]
outfile = open(filename, 'wb')
pickle.dump(dic, outfile)
pickle.dump(compiled_results, outfile)
outfile.close()

df['label'] = results[0][0]
dataFrame['label'] = result[0][0]
# Scale to original Dataset (Optional) #TODO: Improve preprocessing
# df = preprocess.scaleColumnsFrom(base_df, df, label_column='label')
df.to_csv(str(filename)+".csv")
ax1 = df.plot.scatter(x=0, y=1, c='label', colormap='Paired')
pyplot.show()
dataFrame.to_csv(str(filename)+".csv")

if __name__ == '__main__':
main()
10 changes: 5 additions & 5 deletions src/instances_generator/generator.py
@@ -20,7 +20,7 @@ class InstancesGenerator:
5: '_mlabel_classf'
}

def __init__(self, options: dict):
def __init__(self, samples, attributes, classes=None, maker_option=None):
"""
Constructs the generator based on properties desired.
@@ -30,10 +30,10 @@ def __init__(self, options: dict):
properties desired to generate a dataset (e.g. samples,
attributes, classes).
"""
self._samples = options['samples']
self._attributes = options['attributes']
self._classes = options['classes']
self._optional_option = options['maker'][1]
self._samples = samples
self._attributes = attributes
self._classes = classes
self._optional_option = maker_option

def generate(self, type_gen: int) -> DataFrame:
"""
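For reference, the refactored constructor now takes explicit arguments instead of an options dict; a hypothetical instantiation (editor's sketch, assuming generator type 1 is one of the registered makers):

    gen = InstancesGenerator(100, 2, classes=3, maker_option=None)
    df = gen.generate(1)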
3 changes: 3 additions & 0 deletions src/setup/interactor.py
@@ -75,6 +75,9 @@ def measures_input() -> list:
# Appends every Complexity Measure in measures list
return [cm(measure) for measure in input_Stream]

def generation_input() -> int:
return int(input("How many generations do you want to optimize the dataset for?\n"))

def __input_with_default__(input_text: str, default_value, data_type: type):
try:
return data_type(input(input_text))
