Skip to content

Commit

Permalink
[Feature] Módulo Gerador de Instâncias (#24)
Browse files Browse the repository at this point in the history
Merge pull request #24 from SteffanoP/feature/instances-generator
  • Loading branch information
SteffanoP authored Apr 5, 2022
2 parents 64bd498 + b91ba83 commit 51a4f08
Show file tree
Hide file tree
Showing 8 changed files with 309 additions and 109 deletions.
40 changes: 10 additions & 30 deletions src/cbdgen-framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
import rpy2.robjects as robjects

import setup.setup_framework as setup
from instances_generator.generator import InstancesGenerator
import complexity as complx
import generate
import preprocess

# TODO: Implement Setup in a minimal main()
Expand All @@ -29,44 +29,24 @@
tread = ""
select_new_dataset = "N"
NGEN = 1000
# NGEN = options['NGEN']
CXPB = 0.7
MUTPB = 0.2
INDPB = 0.05
POP = 100

n_instancias = options['samples']
n_features = options['attributes']
n_classes = options['classes']

dataset = options['maker'][0]

if(dataset == 1):
centers = int(options['maker'][1])
df = generate.blobs(n_instancias, centers, n_features)
if (dataset == 2):
noise = options['maker'][1]
df = generate.moons(n_instancias, noise)
if (dataset == 3):
noise = options['maker'][1]
df = generate.circles(n_instancias, noise)
if (dataset == 4):
df = generate.classification(n_instancias, n_features, n_classes)
if (dataset == 5):
n_labels = int(options['maker'][1])
df = generate.multilabel_classification(n_instancias, n_features, n_classes, n_labels)
# TODO: Implement Generator of Instances in a minimal main()
gen_instances = InstancesGenerator(options)
df = gen_instances.generate(options['maker'][0])

filename = options['filename'] if options['filename'] != "" else "NGEN=" + \
str(NGEN)

print("Você deseja basear as métricas a um dataset já existente? (y/N)")
escolha = input()

metricasList = options['measures']

if (escolha == 'y'):
base_dataset = load_iris()
base_df = pd.DataFrame(data=np.c_[base_dataset['data'], base_dataset['target']], columns=base_dataset['feature_names'] + ['target'])
target = "target"
if (options['filepath'] != ""):
base_df = pd.read_csv(options['filepath'])
target = options['label_name']

# Copying Columns names
df.columns = preprocess.copyFeatureNamesFrom(base_df, label_name=target)
Expand Down Expand Up @@ -120,7 +100,7 @@
globalF2 = float(objetivo)
filename += "-F2"

N_ATTRIBUTES = int(n_instancias)
N_ATTRIBUTES = int(options['samples'])  # NOTE: misnamed - holds the number of samples, not attributes
print(metricasList, len(metricasList))
print(globalN1, globalLinear, globalBalance, globalF2)
NOBJ = len(metricasList)
Expand Down Expand Up @@ -207,7 +187,7 @@ def print_evaluate(individual):
creator.create("Individual", list, fitness=creator.FitnessMin)

RANDINT_LOW = 0
RANDINT_UP = n_classes - 1
RANDINT_UP = options['classes'] - 1

toolbox = base.Toolbox()
toolbox.register("attr_int", random.randint, RANDINT_LOW, RANDINT_UP)
Expand Down
53 changes: 0 additions & 53 deletions src/generate.py

This file was deleted.

73 changes: 73 additions & 0 deletions src/instances_generator/generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from pandas import DataFrame
import instances_generator.maker as maker

class InstancesGenerator:
    """
    Generate synthetic instances from a number of samples, attributes and
    classes.

    Methods
    -------
    generate(type_gen):
        Build a DataFrame of instances with the maker selected by
        ``type_gen``.
    """

    # Maps the numeric maker selector to the private method that runs it.
    types_generator = {
        1: '_blobs',
        2: '_moons',
        3: '_circles',
        4: '_classf',
        5: '_mlabel_classf'
    }

    def __init__(self, options: dict):
        """
        Construct the generator from the desired properties.

        Parameters
        ----------
        options : dict
            Properties used to generate a data set. The keys read here are
            ``'samples'``, ``'attributes'``, ``'classes'`` and ``'maker'``.
            ``options['maker'][1]`` is the maker-specific extra option
            (centers, noise or labels) and may be absent for makers that do
            not need one.
        """
        self._samples = options['samples']
        self._attributes = options['attributes']
        self._classes = options['classes']
        # Not every maker takes an extra option, so a one-element 'maker'
        # must not fail at construction time.
        maker_args = options['maker']
        self._optional_option = maker_args[1] if len(maker_args) > 1 else None

    def generate(self, type_gen: int) -> DataFrame:
        """
        Generate a DataFrame of instances built by the selected maker.

        Parameters
        ----------
        type_gen : int, required
            Selects the maker used to generate the data set; must be one of
            the keys of ``types_generator``.

        Returns
        -------
        pandas.DataFrame

        Raises
        ------
        ValueError
            If ``type_gen`` is not a known maker, or the selected maker
            needs an extra option that was not supplied.
        """
        method_name = self.types_generator.get(type_gen)
        if method_name is None:
            valid = ", ".join(str(key) for key in self.types_generator)
            raise ValueError(
                f"Unknown maker type {type_gen!r}; expected one of: {valid}"
            )
        return getattr(self, method_name)()

    def _require_option(self, name: str):
        """Return the maker-specific option or raise if it was not given."""
        if self._optional_option is None:
            raise ValueError(
                f"The selected maker requires the '{name}' option"
            )
        return self._optional_option

    def _blobs(self) -> DataFrame:
        # scikit-learn treats a non-integer `centers` as explicit center
        # coordinates, so the count is converted to int first.
        centers = int(self._require_option('centers'))
        return maker.blobs(self._samples, centers, self._attributes)

    def _moons(self) -> DataFrame:
        # `noise` is passed through unchanged; it is None when omitted.
        noise = self._optional_option
        return maker.moons(self._samples, noise)

    def _circles(self) -> DataFrame:
        # `noise` is passed through unchanged; it is None when omitted.
        noise = self._optional_option
        return maker.circles(self._samples, noise)

    def _classf(self) -> DataFrame:
        return maker.classification(self._samples, self._attributes,
                                    self._classes)

    def _mlabel_classf(self) -> DataFrame:
        # The average number of labels is converted to int before use.
        labels = int(self._require_option('labels'))
        return maker.multilabel_classification(self._samples,
                                               self._attributes,
                                               self._classes, labels)
126 changes: 126 additions & 0 deletions src/instances_generator/maker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import pandas as pd
import numpy as np

# Supported dataset generators
from sklearn.datasets import make_blobs
from sklearn.datasets import make_moons
from sklearn.datasets import make_circles
from sklearn.datasets import make_classification
from sklearn.datasets import make_multilabel_classification as make_mlabel_classification

# Seed drawn once when this module is imported and passed to every call of
# make_multilabel_classification, so repeated calls within one process use the
# same random state. The value itself is not fixed across runs unless NumPy's
# global random generator was seeded before this import.
RANDOM_SEED = np.random.randint(2 ** 10)

def blobs(samples, centers, features):
    """
    Generate isotropic Gaussian blobs for clustering, exposing only three of
    the parameters of scikit-learn's ``make_blobs``.

    See more at <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html>

    Parameters
    ----------
    samples : int
        The total number of points equally divided among clusters.
    centers : int
        The number of centers to generate, or the fixed center locations.
    features : int
        The number of features for each sample.

    Returns
    -------
    DataFrame : pandas.DataFrame
        The generated samples, one column per feature, plus a ``'label'``
        column holding the cluster membership of each sample.
    """
    points, cluster_ids = make_blobs(
        n_samples=samples,
        centers=centers,
        n_features=features,
    )
    return _create_pd_dataframe(points, cluster_ids)

def moons(samples, noise):
    """
    Make two interleaving half circles.

    See more at <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html>

    Parameters
    ----------
    samples : int
        The total number of points generated.
    noise : float
        Standard deviation of Gaussian noise added to the data.

    Returns
    -------
    DataFrame : pandas.DataFrame
        The generated samples, one column per coordinate, plus a ``'label'``
        column holding the class of each sample.
    """
    points, class_ids = make_moons(
        n_samples=samples,
        noise=noise,
    )
    return _create_pd_dataframe(points, class_ids)

def circles(samples, noise):
    """
    Make a large circle containing a smaller circle in 2d.

    See more at <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_circles.html>

    Parameters
    ----------
    samples : int
        The total number of points generated.
    noise : float
        Standard deviation of Gaussian noise added to the data.

    Returns
    -------
    DataFrame : pandas.DataFrame
        The generated samples, one column per coordinate, plus a ``'label'``
        column holding the class of each sample.
    """
    points, class_ids = make_circles(
        n_samples=samples,
        noise=noise,
    )
    return _create_pd_dataframe(points, class_ids)

def classification(samples, features, classes):
    """
    Generate a random n-class classification problem.

    See more at <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html>

    Parameters
    ----------
    samples : int
        The total number of points.
    features : int
        The total number of features for each sample. No redundant features
        are generated.
    classes : int
        The number of classes (labels) of the classification problem.

    Returns
    -------
    DataFrame : pandas.DataFrame
        The generated samples, one column per feature, plus a ``'label'``
        column holding the class of each sample.

    Notes
    -----
    One cluster is generated per class. scikit-learn requires
    ``n_classes * n_clusters_per_class <= 2 ** n_informative``, so the number
    of informative features is 2 for up to four classes and grows to the
    smallest value satisfying that constraint for more classes. ``features``
    must be at least that large, otherwise scikit-learn raises ``ValueError``.
    """
    # Smallest k with 2 ** k >= classes, never below the original value of 2.
    n_informative = max(2, (int(classes) - 1).bit_length())
    X, y = make_classification(
        n_samples=samples,
        n_features=features,
        n_classes=classes,
        n_redundant=0,
        n_informative=n_informative,
        n_clusters_per_class=1
    )
    return _create_pd_dataframe(X, y)

def multilabel_classification(samples, features, classes, labels):
    """
    Generate a random multilabel classification problem.

    See more at <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_multilabel_classification.html>

    Parameters
    ----------
    samples : int
        The total number of points.
    features : int
        The total number of features for each sample.
    classes : int
        The number of classes of the classification problem.
    labels : int
        Forwarded as ``n_labels`` to scikit-learn.

    Returns
    -------
    DataFrame : pandas.DataFrame
        The generated samples with the targets stored under ``'label'``.

    Notes
    -----
    Unlabeled samples are disallowed and the module-level ``RANDOM_SEED`` is
    used as the random state.
    NOTE(review): scikit-learn appears to return a 2-D indicator matrix for
    the targets here - confirm that storing it in a single ``'label'``
    column behaves as intended.
    """
    maker_kwargs = {
        'n_samples': samples,
        'n_features': features,
        'n_classes': classes,
        'n_labels': labels,
        'allow_unlabeled': False,
        'random_state': RANDOM_SEED,
    }
    points, targets = make_mlabel_classification(**maker_kwargs)
    return _create_pd_dataframe(points, targets)

def _create_pd_dataframe(samples, label):
    """
    Wrap generated samples in a DataFrame and attach their labels.

    Parameters
    ----------
    samples : array-like
        The generated feature values, one row per sample.
    label : array-like
        The target of each sample, stored in a column named ``'label'``.

    Returns
    -------
    pandas.DataFrame
        A frame with the columns built from ``samples`` followed by the
        ``'label'`` column.
    """
    return pd.DataFrame(samples).assign(label=label)
9 changes: 9 additions & 0 deletions src/setup/argparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@ def parse_args() -> argparse.ArgumentParser:
type=int,
default=2
)
parser.add_argument('-b','--based-on',
dest='option_based_on_filepath_label',
help="Allows the framework to run on based-on mode by "
"passing the filepath of the data set",
nargs=2,
type=str,
default=["",""]
)
parser.add_argument('--classes',
dest='number_of_classes',
help="number of classes to be targeted"
Expand Down Expand Up @@ -50,6 +58,7 @@ def parse_args() -> argparse.ArgumentParser:
type=int,
default=100
)
# TODO: flag --maker must be a required flag
parser.add_argument('--maker',
dest="maker",
help="The maker to generate a random dataset.",
Expand Down
Loading

0 comments on commit 51a4f08

Please sign in to comment.