[Refactor] Main Code to a modular Architecture (#34)
Merge pull request #34 from SteffanoP/refactor/def-main
SteffanoP authored May 10, 2022
2 parents 9da3d8e + dbea3fd commit 36604f1
Showing 4 changed files with 213 additions and 120 deletions.
302 changes: 190 additions & 112 deletions src/cbdgen-framework.py
@@ -1,80 +1,87 @@
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import multiprocessing
import pickle
from sklearn.datasets import load_iris
from matplotlib import pyplot
import random

import numpy as np
import pandas as pd
from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from deap import algorithms

import rpy2.robjects as robjects
from meta_features.ecol import Ecol
from rpy2 import robjects

import setup.setup_framework as setup
from instances_generator.generator import InstancesGenerator
import extractor
import preprocess
import setup.setup_framework as setup
from meta_features.ecol import Ecol
from instances_generator.generator import InstancesGenerator

# TODO: Implement Setup in a minimal main()
options = setup.get_options()

cont = 0
bobj = 0.4
P = [12]
SCALES = [1]
tread = ""
select_new_dataset = "N"
NGEN = 1000
# NGEN = options['NGEN']
CXPB = 0.7
MUTPB = 0.2
INDPB = 0.05
POP = 100

# TODO: Implement Generator of Instances in a minimal main()
gen_instances = InstancesGenerator(options)
df = gen_instances.generate(options['maker'][0])

filename = options['filename'] if options['filename'] != "" else "NGEN=" + \
str(NGEN)

metrics = options['measures']

# TODO: Implement fitness global measures in a minimal main()
global_measures = []
if (options['filepath'] != ""):
base_df = pd.read_csv(options['filepath'])
target = options['label_name']

# Copying Columns names
# df.columns = preprocess.copyFeatureNamesFrom(base_df, label_name=target)

# Extraction of Data Complexity Values
global_measures = tuple(extractor.complexity(base_df, target, metrics))
else:
for metric in metrics:
global_measures.append(options[metric])
global_measures = tuple(global_measures)

filename += '-' + '-'.join(metrics)
N_ATTRIBUTES = int(options['samples'])  # misnamed variable (it stores the sample count)
print(metrics, len(metrics))
print(global_measures)
NOBJ = len(metrics)

dic = {}

# reference points
ref_points = [tools.uniform_reference_points(
NOBJ, p, s) for p, s in zip(P, SCALES)]
ref_points = np.concatenate(ref_points)
_, uniques = np.unique(ref_points, axis=0, return_index=True)
ref_points = ref_points[uniques]
def generate_instances(samples, attributes, classes, maker: tuple[int, str]
) -> pd.DataFrame:
"""
Function responsible for the generation of instances, highly dependent
on an InstancesGenerator object.
Parameters
----------
samples : Number of instances to be generated.
attributes : Number of attributes/features to be generated.
classes : Number of classes an instance can be assigned to.
maker : The type of maker that will generate the set of instances.
Returns
-------
pandas.DataFrame
"""
gen_instances = InstancesGenerator(samples, attributes,
classes=classes,
maker_option=maker[1])
return gen_instances.generate(maker[0])
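
For illustration (an editor's sketch, not part of this commit): a minimal call to generate_instances, assuming maker type 1 is one of the registered generators and the empty string is an acceptable optional maker argument:

    sample_df = generate_instances(samples=100, attributes=2,
                                   classes=2, maker=(1, ''))
    print(sample_df.shape)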

def complexity_extraction(measures: list[str], *,
dataframe_label: tuple[pd.DataFrame,str]=None,
complexity_values: dict) -> tuple[np.float64]:
"""
Function that extracts complexity values of a Data Set, highly dependent
of a extractor module.
Parameters
----------
measures : A list of complexity measures to extract from the Data Set.
dataframe_label : Refers to the DataFrame itself and its label.
complexity_values : Dictionary of complexity values (TODO: Simplify!)
Returns
-------
tuple[complexity_values]
"""
if dataframe_label is not None:
# Copying Columns names
# df.columns = preprocess.copyFeatureNamesFrom(base_df, label_name=target)

# Extraction of Data Complexity Values
return tuple(extractor.complexity(dataframe_label[0],
dataframe_label[1],
measures))
return tuple(complexity_values[cm] for cm in measures)
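
A usage sketch (editor's illustration; 'F1' and 'N1' are assumed to be among the supported complexity measures, and base_df a DataFrame loaded earlier): with a base dataset the values are extracted from it, otherwise the user-supplied targets are returned:

    # Extract from a reference dataset:
    target = complexity_extraction(['F1', 'N1'],
                                   dataframe_label=(base_df, 'label'),
                                   complexity_values={})
    # Fall back to user-provided values:
    target = complexity_extraction(['F1', 'N1'],
                                   complexity_values={'F1': 0.5, 'N1': 0.3})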

# TODO: Build a clever architecture for the filename
def build_filename(filename: str='', *, ngen: int, metrics: list) -> str:
"""
Function that builds a filename based on the number of generations and
metrics used to optimize.
Parameters
----------
filename : Name or Prefix of the File that contains the result of the
optimization process.
ngen : Number of generations of the current run of optimization.
metrics : A list of metrics used to optimize.
"""
filename = filename if filename != "" else "NGEN="+ \
str(ngen)
filename += '-' + '-'.join(metrics)
return filename
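
Given the body above, an empty prefix falls back to the generation count while a non-empty one is kept (editor's example):

    assert build_filename('', ngen=1000, metrics=['F1', 'N1']) == 'NGEN=1000-F1-N1'
    assert build_filename('iris', ngen=1000, metrics=['F1', 'N1']) == 'iris-F1-N1'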

def my_evaluate(individual):
vetor = []
@@ -100,25 +107,70 @@ def print_evaluate(individual):

return tuple(vetor)


creator.create("FitnessMin", base.Fitness, weights=(-1.0,)*NOBJ)
creator.create("Individual", list, fitness=creator.FitnessMin)

RANDINT_LOW = 0
RANDINT_UP = options['classes'] - 1

toolbox = base.Toolbox()
toolbox.register("attr_int", random.randint, RANDINT_LOW, RANDINT_UP)
toolbox.register("individual", tools.initRepeat,
creator.Individual, toolbox.attr_int, N_ATTRIBUTES)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", my_evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutShuffleIndexes, indpb=INDPB)
toolbox.register("select", tools.selNSGA3, ref_points=ref_points)


def main(seed=None):
def setup_engine(options):
"""
Function that sets up a deap.base.Toolbox for the search-engine process.
Parameters
----------
options : Dictionary of setup parameters that determines how the
search engine will find the solutions.
Returns
-------
deap.base.Toolbox
"""
samples = int(options['samples'])
n_objectives = len(options['measures'])

# reference points
ref_points = [tools.uniform_reference_points(
n_objectives, p, s) for p, s in zip(options['P'], options['SCALES'])]
ref_points = np.concatenate(ref_points)
_, uniques = np.unique(ref_points, axis=0, return_index=True)
ref_points = ref_points[uniques]

creator.create("FitnessMin", base.Fitness, weights=(-1.0,)*n_objectives)
creator.create("Individual", list, fitness=creator.FitnessMin)

randint_down = 0
randint_up = options['classes'] - 1

toolbox = base.Toolbox()
toolbox.register("attr_int", random.randint, randint_down, randint_up)
toolbox.register("individual", tools.initRepeat,
creator.Individual, toolbox.attr_int, samples)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", my_evaluate)
toolbox.register("mate", tools.cxTwoPoint)
indpb = options['INDPB']
toolbox.register("mutate", tools.mutShuffleIndexes, indpb=indpb)
toolbox.register("select", tools.selNSGA3, ref_points=ref_points)

return toolbox
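
As a sanity check on the reference-point construction (an editor's sketch using DEAP's tools.uniform_reference_points): with two objectives and p = 12 divisions, NSGA-III places C(2 + 12 - 1, 12) = 13 uniformly spaced points on the simplex:

    from deap import tools

    ref = tools.uniform_reference_points(2, 12, 1)
    print(len(ref))  # 13 reference points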

def results(options: dict, toolbox: base.Toolbox):
"""
Function that runs the search-engine process, executing an
evolutionary algorithm to find the best results.
Parameters
----------
options : Dictionary of setup parameters that determines how the
search engine will find the solutions.
toolbox : A Toolbox for evolution that contains evolutionary operators.
Returns
-------
deap.base.Toolbox.population : A population of the best individuals
from the search-engine process.
deap.tools.Logbook : A logbook that contains evolutionary and
statistical information about the search process.
"""
pop = options['POP']
cxpb = options['CXPB']
mutpb = options['MUTPB']
ngen = options['NGEN']
random.seed(64)
pool = multiprocessing.Pool(processes=12)
toolbox.register("map", pool.map)
@@ -132,58 +184,84 @@ def main(seed=None):
logbook = tools.Logbook()
logbook.header = "gen", "evals", "std", "min", "avg", "max"

pop = toolbox.population(POP)
tool_pop = toolbox.population(pop)

# Evaluate the individuals with an invalid fitness
invalid_ind = [ind for ind in pop if not ind.fitness.valid]
invalid_ind = [ind for ind in tool_pop if not ind.fitness.valid]
fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
for ind, fit in zip(invalid_ind, fitnesses):
ind.fitness.values = fit
# Compile statistics about the population
record = stats.compile(pop)
record = stats.compile(tool_pop)

logbook.record(gen=0, evals=len(invalid_ind), **record)
print(logbook.stream)
# Begin the generational process
for gen in range(1, NGEN):
offspring = algorithms.varAnd(pop, toolbox, CXPB, MUTPB)
for gen in range(1, ngen):
offspring = algorithms.varAnd(tool_pop, toolbox, cxpb, mutpb)
# Evaluate the individuals with an invalid fitness
invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
for ind, fit in zip(invalid_ind, fitnesses):
ind.fitness.values = fit
# Select the next generation population from parents and offspring
pop = toolbox.select(pop + offspring, POP)
tool_pop = toolbox.select(tool_pop + offspring, pop)

# Compile statistics about the new population
record = stats.compile(pop)
record = stats.compile(tool_pop)
logbook.record(gen=gen, evals=len(invalid_ind), **record)
print(logbook.stream)
return pop, logbook
return tool_pop, logbook

def main():
options = setup.get_options()

if options['filepath'] != '':
base_df = pd.read_csv(options['filepath'])

global dataFrame
dataFrame = generate_instances(options['samples'], options['attributes'],
options['classes'], options['maker'])

complexity_values = {}
global metrics
metrics = options['measures']
for measure in metrics:
complexity_values[measure] = options[measure]
# Guard the case where no base dataset was supplied; otherwise base_df
# would be unbound when options['filepath'] is empty.
dataframe_label = ((base_df, options['label_name'])
if options['filepath'] != '' else None)
global global_measures
global_measures = complexity_extraction(metrics,
dataframe_label=dataframe_label,
complexity_values=complexity_values)

filename = build_filename(options['filename'],
ngen=options['NGEN'],
metrics=metrics)


if __name__ == '__main__':
cont1 = 0
cont0 = 0
#dataFrame = pd.read_csv(str(N_ATTRIBUTES) + '.csv')
#dataFrame = dataFrame.drop('c0', axis=1)
dataFrame = df
# This Ecol object should be called according to the variable dataFrame.
# If dataFrame is renamed, then ecol_dataFrame should be renamed
# accordingly.
global ecol_dataFrame
ecol_dataFrame = Ecol(dataframe=dataFrame, label='label')
results = main()
print("logbook")
print(results[0][0])
for x in range(len(results[0])):
dic[print_evaluate(results[0][x])] = results[0][x]

print(metrics, len(metrics))
print(global_measures)
toolbox = setup_engine(options)
result = results(options, toolbox)

compiled_results = {}
for x in range(len(result[0])):
compiled_results[print_evaluate(result[0][x])] = result[0][x]
outfile = open(filename, 'wb')
pickle.dump(dic, outfile)
pickle.dump(compiled_results, outfile)
outfile.close()

df['label'] = results[0][0]
dataFrame['label'] = result[0][0]
# Scale to original Dataset (Optional) #TODO: Improve preprocessing
# df = preprocess.scaleColumnsFrom(base_df, df, label_column='label')
df.to_csv(str(filename)+".csv")
ax1 = df.plot.scatter(x=0, y=1, c='label', colormap='Paired')
pyplot.show()
dataFrame.to_csv(str(filename)+".csv")

if __name__ == '__main__':
main()
10 changes: 5 additions & 5 deletions src/instances_generator/generator.py
@@ -20,7 +20,7 @@ class InstancesGenerator:
5: '_mlabel_classf'
}

def __init__(self, options: dict):
def __init__(self, samples, attributes, classes=None, maker_option=None):
"""
Constructs the generator based on properties desired.
@@ -30,10 +30,10 @@ def __init__(self, options: dict):
properties desired to generate a dataset (e.g. samples,
attributes, classes).
"""
self._samples = options['samples']
self._attributes = options['attributes']
self._classes = options['classes']
self._optional_option = options['maker'][1]
self._samples = samples
self._attributes = attributes
self._classes = classes
self._optional_option = maker_option

def generate(self, type_gen: int) -> DataFrame:
"""
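For reference, the refactored constructor now takes explicit arguments instead of an options dict; a hypothetical instantiation (editor's sketch, assuming generator type 1 is one of the registered makers):

    gen = InstancesGenerator(100, 2, classes=3, maker_option=None)
    df = gen.generate(1)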
3 changes: 3 additions & 0 deletions src/setup/interactor.py
@@ -75,6 +75,9 @@ def measures_input() -> list:
# Appends every Complexity Measure in measures list
return [cm(measure) for measure in input_Stream]

def generation_input() -> int:
return int(input("How many generations do you want to optimize the dataset for?\n"))

def __input_with_default__(input_text: str, default_value, data_type: type):
try:
return data_type(input(input_text))
