Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Consensus Operators #96

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 83 additions & 7 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,15 +93,15 @@ def test_combine_dfs():
"""Check combine_dfs operator"""
tpot_obj = TPOT()

df1 = pd.DataFrame({'a': range(10),
'b': range(10, 20)})
df1 = pd.DataFrame({'a': list(range(10)),
'b': list(range(10, 20))})

df2 = pd.DataFrame({'b': range(10, 20),
'c': range(20, 30)})
df2 = pd.DataFrame({'b': list(range(10, 20)),
'c': list(range(20, 30))})

combined_df = pd.DataFrame({'a': range(10),
'b': range(10, 20),
'c': range(20, 30)})
combined_df = pd.DataFrame({'a': list(range(10)),
'b': list(range(10, 20)),
'c': list(range(20, 30))})

assert tpot_obj._combine_dfs(df1, df2).equals(combined_df)

Expand All @@ -127,3 +127,79 @@ def test_static_models():
result = result[result['group'] == 'testing']

assert np.array_equal(result['guess'].values, sklearn_model_obj.predict(testing_features)), "Model {} failed".format(str(model))

def test_consensus():
    """Check the _consensus_two, _consensus_three and _consensus_four operators"""
    tpot_obj = TPOT()

    def make_df(guesses):
        # All input frames share the same two feature columns and the same
        # true classes; only the per-row guesses differ between them.
        return pd.DataFrame({'00001': [2] * 10,
                             '00002': [3] * 10,
                             'class': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
                             'guess': guesses})

    df1 = make_df([1, 0, 1, 0, 1, 0, 1, 0, 1, 0])
    df2 = make_df([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
    df3 = make_df([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
    df4 = make_df([1, 1, 0, 0, 1, 1, 0, 0, 1, 1])

    # Each case is (operator, positional arguments, expected 'guess' column).
    # The first two arguments are the weighting-scheme and method indices;
    # the trailing comment spells out what they select.
    cases = [
        (tpot_obj._consensus_two, (0, 2, df1, df2),
         [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]),  # accuracy, max
        (tpot_obj._consensus_two, (0, 3, df1, df2),
         [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]),  # accuracy, mean
        (tpot_obj._consensus_two, (0, 4, df1, df2),
         [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]),  # accuracy, median
        (tpot_obj._consensus_two, (0, 5, df1, df2),
         [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),  # accuracy, min
        (tpot_obj._consensus_two, (1, 2, df1, df2),
         [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]),  # uniform, max
        (tpot_obj._consensus_three, (1, 6, df1, df2, df3),
         [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),  # uniform, threshold
        (tpot_obj._consensus_three, (1, 2, df1, df3, df4),
         [1, 1, 0, 0, 1, 1, 0, 0, 1, 1]),  # uniform, max
        (tpot_obj._consensus_four, (1, 2, df1, df2, df3, df4),
         [1, 0, 0, 0, 1, 1, 1, 0, 1, 1]),  # uniform, max
    ]

    for consensus_fn, args, expected in cases:
        actual = consensus_fn(*args)['guess'].values
        # Name the operator and option indices so a failure identifies the case.
        assert np.array_equal(actual, expected), \
            "{} with weight scheme {} and method {} returned {}, expected {}".format(
                consensus_fn.__name__, args[0], args[1], list(actual), expected)

271 changes: 271 additions & 0 deletions tpot/export_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,106 @@

import deap

# Every consensus option by name. Indices up to and including
# ``consensus_opt_split_ix`` are weighting schemes; the later indices are the
# methods used to combine the weighted guesses.
consensus_options = ['accuracy', 'uniform', 'max', 'mean', 'median', 'min', 'threshold']
num_consensus_options = len(consensus_options)
consensus_opt_split_ix = 1

# Helper definitions emitted ahead of every consensus operator. The emitted
# code refers to ``operator`` and ``np``, so both must be in scope wherever the
# exported text is executed.
_CONSENSUS_HELPERS_TEXT = '''\n
def _get_ht_dict(classes, weights):
    """Return a dictionary where the keys are the unique class values present in this row of guesses, and the weights are the weights assigned to each guess.
    """
    ret = {}
    ctr = 0
    for cls in classes:
        try:
            ret[cls] += weights[ctr]
        except KeyError:
            ret[cls] = weights[ctr]
        ctr += 1
    return ret

def _get_top( classes, tups):
    """Return the class from the row in the first DataFrame passed to the function (e.g., input_df1)
    """
    values = [tup[0] for tup in tups if tup[1] == tups[0][1]]
    for class_ in classes:
        if class_ in values:
            return class_
'''

# Emitted definition for each combination method, keyed by its name in
# ``consensus_options``. Every snippet ends by binding ``method`` to the
# function it defines, because the operator text that follows calls
# ``method(...)`` once per row.
_CONSENSUS_METHOD_TEXT = {
    'max': '''\n
def _max_class(classes, weights):
    """Return the class with the highest weight, or the class that appears first with that weight (e.g., input_df1)
    """
    ht = _get_ht_dict(classes, weights)
    return _get_top(classes, sorted(list(ht.items()), key=operator.itemgetter(1), reverse=True))
method = _max_class
''',
    'mean': '''\n
def _mean_class( classes, weights):
    """Return the class closest to the mean weight, or the class that appears first with that weight (e.g., input_df1)
    """
    ht = _get_ht_dict(classes, weights)
    mean_val = np.mean(list(ht.values()))
    return _get_top(classes, sorted(((x, abs(y - mean_val)) for (x,y) in list(ht.items())), key=operator.itemgetter(1)))
method = _mean_class
''',
    'median': '''\n
def _median_class(classes, weights):
    """Return the class closest to the median weight, or the class that appears first with that weight (e.g., input_df1)
    """
    ht = _get_ht_dict(classes, weights)
    median_val = np.median(list(ht.values()))
    return _get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in list(ht.items())), key=operator.itemgetter(1)))
method = _median_class
''',
    'min': '''\n
def _min_class(classes, weights):
    """Return the class with the minimal weight, or the class that appears first with that weight (e.g., input_df1)
    """
    ht = _get_ht_dict(classes, weights)
    return _get_top(classes, sorted(list(ht.items()), key=operator.itemgetter(1)))
method = _min_class
''',
    # The emitted function takes no ``self`` (it is a plain module-level
    # function in the exported script) and is bound to ``method`` like the
    # other options; without that binding the exported code has no ``method``
    # to call when 'threshold' is selected.
    'threshold': '''\n
def _threshold_class(classes, weights):
    """Return the class with that contains a certain percentage of the weight
    """
    ht = _get_ht_dict(classes, weights)
    total_weight = sum(list(ht.values()))
    threshold = 0.75
    sorted_classes = sorted(((x, float(y) / total_weight) for (x,y) in list(ht.items()) if (float(y) / total_weight) > threshold), key=operator.itemgetter(1), reverse=True)
    while len(sorted_classes) == 0:
        threshold = threshold - 0.05
        sorted_classes = sorted(((x, float(y) / total_weight) for (x,y) in list(ht.items()) if (float(y) / total_weight) > threshold), key=operator.itemgetter(1), reverse=True)

    return _get_top(classes, sorted_classes)
method = _threshold_class
''',
}


def consensus_operator_prefix(weight_scheme, method, operator_text):
    """Utility function for generating the first part of the consensus operator text

    Appends the shared helper definitions to ``operator_text`` and then, when
    ``method`` selects one of the combination methods, the definition of that
    method followed by a ``method = <function>`` binding.

    Parameters
    ----------
    weight_scheme: integer
        The corrected-for weight_scheme index in consensus_options
        (currently unused by this function)
    method: integer
        The corrected-for method index in consensus_options; it is reduced
        modulo num_consensus_options before the lookup
    operator_text: String
        The current operator export string (to be appended onto)

    Returns
    -------
    operator_text: String
        The updated operator export string

    NOTE(review): the returned string already starts with the incoming
    ``operator_text``, so a call site written as
    ``operator_text += consensus_operator_prefix(..., operator_text)``
    duplicates the existing text; plain assignment avoids that.
    """
    method_name = consensus_options[method % num_consensus_options]

    operator_text += _CONSENSUS_HELPERS_TEXT
    # Weighting-scheme names have no snippet, so only the helpers are emitted
    # for them.
    operator_text += _CONSENSUS_METHOD_TEXT.get(method_name, '')
    return operator_text


def replace_mathematical_operators(exported_pipeline):
"""Replace all of the mathematical operators with their results for use in export(self, output_file_name)

Expand Down Expand Up @@ -293,6 +393,177 @@ def replace_function_calls(pipeline_list):
operator_text += '\n# Combine two DataFrames'
operator_text += '\n{OUTPUT_DF} = {INPUT_DF1}.join({INPUT_DF2}[[column for column in {INPUT_DF2}.columns.values if column not in {INPUT_DF1}.columns.values]])\n'.format(INPUT_DF1=operator[2], INPUT_DF2=operator[3], OUTPUT_DF=result_name)

elif operator_name == '_consensus_two':
weight_scheme = int(operator[2])
method = int(operator[3])
if weight_scheme % num_consensus_options > consensus_opt_split_ix:
weight_scheme = consensus_opt_split_ix
if method % num_consensus_options <= consensus_opt_split_ix:
method = consensus_opt_split_ix + 1

operator_text += consensus_operator_prefix(weight_scheme, method, operator_text)

operator_text += '\n# Combine two DataFrames'
operator_text += '\ndfs = [{0}, {1}]'.format(operator[4], operator[5])
operator_text += '''
ignore_consensus = False
if any(len(df.columns) == 3 for df in dfs):
found = False

for df in dfs:
if len(df.columns) > 3:
{0} = df.copy()
found = True
break
if not found:
ignore_consensus = True
{0} = dfs[0].copy()

if not ignore_consensus:
weights = []
for df in dfs:
tup = df[['guess', 'class']]
num_correct = len(np.where(tup['guess'] == tup['class'])[0])
total_vals = len(tup['guess'].index)'''.format(result_name)
if consensus_options[weight_scheme % num_consensus_options] == 'accuracy':
operator_text +='''
weights.append(float(num_correct) / float(total_vals))
'''
elif consensus_options[weight_scheme % num_consensus_options] == 'uniform':
operator_text +='''
weights.append(1.0)
'''
operator_text += '''
# Initialize the dataFrame containing just the guesses, and to hold the results
merged_guesses = pd.DataFrame(data={0}[['guess']].values, columns=['guess_1'])
merged_guesses.loc[:, 'guess_2'] = {1}['guess']
merged_guesses.loc[:, 'guess'] = None

for row_ix in merged_guesses.index:
merged_guesses['guess'].loc[row_ix] = method(merged_guesses[['guess_1', 'guess_2']].iloc[row_ix], weights)
{2} = {0}.join({1}[[column for column in {1}.columns.values if column not in {0}.columns.values]])
if 'guess' in {2}.columns.values:
{2} = {2}.drop('guess', 1).join(merged_guesses['guess']).copy()
else:
{2} = {2}.join(merged_guesses['guess'])
'''.format(operator[4], operator[5], result_name)

elif operator_name == '_consensus_three':
weight_scheme = int(operator[2])
method = int(operator[3])
if weight_scheme % num_consensus_options > consensus_opt_split_ix:
weight_scheme = consensus_opt_split_ix
if method % num_consensus_options <= consensus_opt_split_ix:
method = consensus_opt_split_ix + 1

operator_text += consensus_operator_prefix(weight_scheme, method, operator_text)

operator_text += '\n# Combine three DataFrames'
operator_text += '\ndfs = [{0}, {1}, {2}]'.format(operator[4], operator[5], operator[6])
operator_text += '''
ignore_consensus = False
if any(len(df.columns) == 3 for df in dfs):
found = False

for df in dfs:
if len(df.columns) > 3:
{0} = df.copy()
found = True
break
if not found:
ignore_consensus = True
{0} = dfs[0].copy()

if not ignore_consensus:
weights = []
for df in dfs:
tup = df[['guess', 'class']]
num_correct = len(np.where(tup['guess'] == tup['class'])[0])
total_vals = len(tup['guess'].index)'''.format(result_name)
if consensus_options[weight_scheme % num_consensus_options] == 'accuracy':
operator_text +='''
weights.append(float(num_correct) / float(total_vals))
'''
elif consensus_options[weight_scheme % num_consensus_options] == 'uniform':
operator_text +='''
weights.append(1.0)
'''
operator_text += '''
# Initialize the dataFrame containing just the guesses, and to hold the results
merged_guesses = pd.DataFrame(data={0}[['guess']].values, columns=['guess_1'])
merged_guesses.loc[:, 'guess_2'] = {1}['guess']
merged_guesses.loc[:, 'guess_3'] = {2}['guess']
merged_guesses.loc[:, 'guess'] = None

for row_ix in merged_guesses.index:
merged_guesses['guess'].loc[row_ix] = method(merged_guesses[['guess_1', 'guess_2', 'guess_3']].iloc[row_ix], weights)
{3} = {0}.join({1}[[column for column in {1}.columns.values if column not in {0}.columns.values]])
{3} = {3}.join({2}[[column for column in {2}.columns.values if column not in {3}.columns.values]])
if 'guess' in {3}.columns.values:
{3} = {3}.drop('guess', 1).join(merged_guesses['guess']).copy()
else:
{3} = {3}.join(merged_guesses['guess'])
'''.format(operator[4], operator[5], operator[6], result_name)

elif operator_name == '_consensus_four':
weight_scheme = int(operator[2])
method = int(operator[3])
if weight_scheme % num_consensus_options > consensus_opt_split_ix:
weight_scheme = consensus_opt_split_ix
if method % num_consensus_options <= consensus_opt_split_ix:
method = consensus_opt_split_ix + 1

operator_text += consensus_operator_prefix(weight_scheme, method, operator_text)

operator_text += '\n# Combine four DataFrames'
operator_text += '\ndfs = [{0}, {1}, {2}, {3}]'.format(operator[4], operator[5], operator[6], operator[7])
operator_text += '''
ignore_consensus = False
if any(len(df.columns) == 3 for df in dfs):
found = False

for df in dfs:
if len(df.columns) > 3:
{0} = df.copy()
found = True
break
if not found:
ignore_consensus = True
{0} = dfs[0].copy()

if not ignore_consensus:
weights = []
for df in dfs:
tup = df[['guess', 'class']]
num_correct = len(np.where(tup['guess'] == tup['class'])[0])
total_vals = len(tup['guess'].index)'''.format(result_name)
if consensus_options[weight_scheme % num_consensus_options] == 'accuracy':
operator_text +='''
weights.append(float(num_correct) / float(total_vals))
'''
elif consensus_options[weight_scheme % num_consensus_options] == 'uniform':
operator_text +='''
weights.append(1.0)
'''
operator_text += '''
# Initialize the dataFrame containing just the guesses, and to hold the results
merged_guesses = pd.DataFrame(data={0}[['guess']].values, columns=['guess_1'])
merged_guesses.loc[:, 'guess_2'] = {1}['guess']
merged_guesses.loc[:, 'guess_3'] = {2}['guess']
merged_guesses.loc[:, 'guess_4'] = {3}['guess']
merged_guesses.loc[:, 'guess'] = None

for row_ix in merged_guesses.index:
merged_guesses['guess'].loc[row_ix] = method(merged_guesses[['guess_1', 'guess_2', 'guess_3', 'guess_4']].iloc[row_ix], weights)
{4} = {0}.join({1}[[column for column in {1}.columns.values if column not in {0}.columns.values]])
{4} = {4}.join({2}[[column for column in {2}.columns.values if column not in {4}.columns.values]])
{4} = {4}.join({3}[[column for column in {3}.columns.values if column not in {4}.columns.values]])
if 'guess' in {4}.columns.values:
{4} = {4}.drop('guess', 1).join(merged_guesses['guess']).copy()
else:
{4} = {4}.join(merged_guesses['guess'])
'''.format(operator[4], operator[5], operator[6], result_name)

elif operator_name == '_variance_threshold':
threshold = float(operator[3])

Expand Down
Loading