EpistasisLab · bartleyn · Feb 19, 2016 · Feb 22, 2016 · Feb 22, 2016 · Feb 26, 2016
diff --git a/tests.py b/tests.py
@@ -93,15 +93,15 @@ def test_combine_dfs():
     """Check combine_dfs operator"""
     tpot_obj = TPOT()
 
-    df1 = pd.DataFrame({'a': range(10),
-                        'b': range(10, 20)})
+    df1 = pd.DataFrame({'a': list(range(10)),
+                        'b': list(range(10, 20))})
 
-    df2 = pd.DataFrame({'b': range(10, 20),
-                        'c': range(20, 30)})
+    df2 = pd.DataFrame({'b': list(range(10, 20)),
+                        'c': list(range(20, 30))})
 
-    combined_df = pd.DataFrame({'a': range(10),
-                                'b': range(10, 20),
-                                'c': range(20, 30)})
+    combined_df = pd.DataFrame({'a': list(range(10)),
+                                'b': list(range(10, 20)),
+                                'c': list(range(20, 30))})
 
     assert tpot_obj._combine_dfs(df1, df2).equals(combined_df)
 
@@ -127,3 +127,79 @@ def test_static_models():
         result = result[result['group'] == 'testing']
 
         assert np.array_equal(result['guess'].values, sklearn_model_obj.predict(testing_features)), "Model {} failed".format(str(model))
+
+def test_consensus():
+    """Check the _consensus_two, _consensus_three and _consensus_four operators
+
+    Each case calls an operator with a weight scheme index, a method index and the input
+    DataFrames, then compares the returned 'guess' column with the expected guesses.
+    """
+    tpot_obj = TPOT()
+
+    # Every input shares the same feature and class columns; only 'guess' differs
+    df1 = pd.DataFrame({'00001': [2,2,2,2,2,2,2,2,2,2],
+                        '00002': [3,3,3,3,3,3,3,3,3,3],
+                        'class': [1,1,1,1,1,0,0,0,0,0],
+                        'guess': [1,0,1,0,1,0,1,0,1,0]})
+
+    df2 = pd.DataFrame({'00001': [2,2,2,2,2,2,2,2,2,2],
+                        '00002': [3,3,3,3,3,3,3,3,3,3],
+                        'class': [1,1,1,1,1,0,0,0,0,0],
+                        'guess': [0,0,0,0,0,1,1,1,1,1]})
+
+    df3 = pd.DataFrame({'00001': [2,2,2,2,2,2,2,2,2,2],
+                        '00002': [3,3,3,3,3,3,3,3,3,3],
+                        'class': [1,1,1,1,1,0,0,0,0,0],
+                        'guess': [0,1,0,1,0,1,0,1,0,1]})
+
+    df4 = pd.DataFrame({'00001': [2,2,2,2,2,2,2,2,2,2],
+                        '00002': [3,3,3,3,3,3,3,3,3,3],
+                        'class': [1,1,1,1,1,0,0,0,0,0],
+                        'guess': [1,1,0,0,1,1,0,0,1,1]})
+
+    # Option indices, named after the labels the original cases carried in their comments
+    w_accuracy, w_uniform = 0, 1
+    m_max, m_mean, m_median, m_min, m_threshold = 2, 3, 4, 5, 6
+
+    # Expected 'guess' column per case, named res_<inputs>_<weight scheme>_<method>
+    res_two_acc_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                    'guess': [1,0,1,0,1,0,1,0,1,0]})
+    res_two_acc_mean = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                     'guess': [1,0,1,0,1,0,1,0,1,0]})
+    res_two_acc_med = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                    'guess': [1,0,1,0,1,0,1,0,1,0]})
+    res_two_acc_min = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                    'guess': [0,0,0,0,0,1,1,1,1,1]})
+    res_two_uni_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                    'guess': [1,0,1,0,1,0,1,0,1,0]})
+    res_three_uni_thresh = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                         'guess': [0,0,0,0,0,1,1,1,1,1]})
+    res_three_uni_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                      'guess': [1,1,0,0,1,1,0,0,1,1]})
+    res_four_uni_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                     'guess': [1,0,0,0,1,1,1,0,1,1]})
+
+    # Each entry: (operator, weight scheme index, method index, input DataFrames...)
+    tests = [(tpot_obj._consensus_two, w_accuracy, m_max, df1, df2),
+             (tpot_obj._consensus_two, w_accuracy, m_mean, df1, df2),
+             (tpot_obj._consensus_two, w_accuracy, m_median, df1, df2),
+             (tpot_obj._consensus_two, w_accuracy, m_min, df1, df2),
+             (tpot_obj._consensus_two, w_uniform, m_max, df1, df2),
+             (tpot_obj._consensus_three, w_uniform, m_threshold, df1, df2, df3),
+             (tpot_obj._consensus_three, w_uniform, m_max, df1, df3, df4),
+             (tpot_obj._consensus_four, w_uniform, m_max, df1, df2, df3, df4),
+             # NOTE(review): an adaboost-weighted _consensus_two case used to sit here, disabled; restore if supported
+             ]
+    results = [res_two_acc_max,
+               res_two_acc_mean,
+               res_two_acc_med,
+               res_two_acc_min,
+               res_two_uni_max,
+               res_three_uni_thresh,
+               res_three_uni_max,
+               res_four_uni_max,
+               ]
+
+    for case_ix, (test, res) in enumerate(zip(tests, results)):
+        assert np.array_equal(test[0](*test[1:])['guess'].values, res['guess'].values), "Consensus case {} failed".format(case_ix)
+
diff --git a/tpot/export_utils.py b/tpot/export_utils.py
@@ -22,6 +22,106 @@
 
 import deap
 
+consensus_options = ['accuracy', 'uniform', 'max', 'mean', 'median', 'min', 'threshold']  # weight schemes first, then methods
+num_consensus_options = len(consensus_options)  # derived so it cannot drift from the list above
+consensus_opt_split_ix = 1  # last index naming a weight scheme; every later entry is a method
+
+def consensus_operator_prefix(weight_scheme, method, operator_text):
+    """Append the consensus helper functions' source code to the export text
+
+    Parameters
+    ----------
+    weight_scheme: integer
+        The corrected-for weight_scheme index in consensus_options (not read by this function)
+    method: integer
+        The corrected-for method index in consensus_options; selects the helper bound to `method`
+    operator_text: String
+        The current operator export string (to be appended onto)
+
+    Returns
+    -------
+    operator_text: String
+        The input string followed by `_get_ht_dict`, `_get_top` and the selected helper
+    """
+    method_name = consensus_options[method % num_consensus_options]  # modulo keeps any integer index in range
+    operator_text += '''\n
+def _get_ht_dict(classes, weights):
+    """Return a dictionary where the keys are the unique class values present in this row of guesses, and the weights are the weights assigned to each guess.
+    """
+    ret = {}
+    ctr = 0
+    for cls in classes:
+        try:
+            ret[cls] += weights[ctr]
+        except KeyError:
+            ret[cls] = weights[ctr]
+        ctr += 1
+    return ret
+
+def _get_top( classes, tups):
+    """Return the class from the row in the first DataFrame passed to the function (e.g., input_df1)
+    """
+    values = [tup[0] for tup in tups if tup[1] == tups[0][1]]
+    for class_ in classes:
+        if class_ in values:
+            return class_
+    '''
+    if method_name == 'max':
+        operator_text += '''\n
+def _max_class(classes, weights):
+    """Return the class with the highest weight, or the class that appears first with that weight (e.g., input_df1)
+    """
+    ht = _get_ht_dict(classes, weights)
+    return _get_top(classes, sorted(list(ht.items()), key=operator.itemgetter(1), reverse=True))
+method = _max_class
+        '''
+    elif method_name == 'mean':
+        operator_text += '''\n
+def _mean_class( classes, weights):
+    """Return the class closest to the mean weight, or the class that appears first with that weight (e.g., input_df1)
+    """
+    ht = _get_ht_dict(classes, weights)
+    mean_val = np.mean(list(ht.values()))
+    return _get_top(classes, sorted(((x, abs(y - mean_val)) for (x,y) in list(ht.items())), key=operator.itemgetter(1)))
+method = _mean_class
+        '''
+    elif method_name == 'median':
+        operator_text += '''\n
+def _median_class(classes, weights):
+    """Return the class closest to the median weight, or the class that appears first with that weight (e.g., input_df1)
+    """
+    ht = _get_ht_dict(classes, weights)
+    median_val = np.median(list(ht.values()))
+    return _get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in list(ht.items())), key=operator.itemgetter(1)))
+method = _median_class
+        '''
+    elif method_name == 'min':
+        operator_text += '''\n
+def _min_class(classes, weights):
+    """Return the class with the minimal weight, or the class that appears first with that weight (e.g., input_df1)
+    """
+    ht = _get_ht_dict(classes, weights)
+    return _get_top(classes, sorted(list(ht.items()), key=operator.itemgetter(1)))
+method = _min_class
+        '''
+    elif method_name == 'threshold':  # the export calls method(row, weights), so the helper takes no `self`
+        operator_text += '''\n
+def _threshold_class(classes, weights):
+    """Return the class with that contains a certain percentage of the weight 
+    """
+    ht = _get_ht_dict(classes, weights)
+    total_weight = sum(list(ht.values()))
+    threshold = 0.75
+    sorted_classes = sorted(((x, float(y) / total_weight) for (x,y) in list(ht.items()) if (float(y) / total_weight) > threshold), key=operator.itemgetter(1), reverse=True)
+    while len(sorted_classes) == 0:
+        threshold = threshold - 0.05
+        sorted_classes = sorted(((x, float(y) / total_weight) for (x,y) in list(ht.items()) if (float(y) / total_weight) > threshold), key=operator.itemgetter(1), reverse=True)
+    return _get_top(classes, sorted_classes)
+method = _threshold_class
+        '''
+    return operator_text
+
+
 def replace_mathematical_operators(exported_pipeline):
     """Replace all of the mathematical operators with their results for use in export(self, output_file_name)
 
@@ -293,6 +393,177 @@ def replace_function_calls(pipeline_list):
             operator_text += '\n# Combine two DataFrames'
             operator_text += '\n{OUTPUT_DF} = {INPUT_DF1}.join({INPUT_DF2}[[column for column in {INPUT_DF2}.columns.values if column not in {INPUT_DF1}.columns.values]])\n'.format(INPUT_DF1=operator[2], INPUT_DF2=operator[3], OUTPUT_DF=result_name)
 
+        elif operator_name == '_consensus_two':
+            weight_scheme = int(operator[2])
+            method = int(operator[3])
+            if weight_scheme % num_consensus_options > consensus_opt_split_ix:
+                weight_scheme = consensus_opt_split_ix
+            if method % num_consensus_options <= consensus_opt_split_ix:
+                method = consensus_opt_split_ix + 1
+
+            operator_text += consensus_operator_prefix(weight_scheme, method, operator_text)
+
+            operator_text += '\n# Combine two DataFrames'
+            operator_text += '\ndfs = [{0}, {1}]'.format(operator[4], operator[5])
+            operator_text += '''
+ignore_consensus = False
+if any(len(df.columns) == 3 for df in dfs):
+    found = False
+
+    for df in dfs:
+        if len(df.columns) > 3:
+            {0} = df.copy()
+            found = True
+            break
+    if not found:
+        ignore_consensus = True
+        {0} = dfs[0].copy()
+
+if not ignore_consensus:
+    weights = []
+    for df in dfs:
+        tup = df[['guess', 'class']]
+        num_correct = len(np.where(tup['guess'] == tup['class'])[0])
+        total_vals = len(tup['guess'].index)'''.format(result_name)
+            if consensus_options[weight_scheme % num_consensus_options] == 'accuracy':
+                operator_text +='''
+        weights.append(float(num_correct) / float(total_vals))
+        '''
+            elif consensus_options[weight_scheme % num_consensus_options] == 'uniform':
+                operator_text +='''
+        weights.append(1.0)
+        '''
+            operator_text += '''
+    # Initialize the dataFrame containing just the guesses, and to hold the results
+    merged_guesses = pd.DataFrame(data={0}[['guess']].values, columns=['guess_1'])
+    merged_guesses.loc[:, 'guess_2'] = {1}['guess']
+    merged_guesses.loc[:, 'guess'] = None
+
+    for row_ix in merged_guesses.index:
+        merged_guesses['guess'].loc[row_ix] = method(merged_guesses[['guess_1', 'guess_2']].iloc[row_ix], weights)
+    {2} = {0}.join({1}[[column for column in {1}.columns.values if column not in {0}.columns.values]])
+    if 'guess' in {2}.columns.values:
+        {2} = {2}.drop('guess', 1).join(merged_guesses['guess']).copy()
+    else:
+        {2} = {2}.join(merged_guesses['guess'])
+        '''.format(operator[4], operator[5], result_name)
+
+        elif operator_name == '_consensus_three':
+            weight_scheme = int(operator[2])
+            method = int(operator[3])
+            if weight_scheme % num_consensus_options > consensus_opt_split_ix:
+                weight_scheme = consensus_opt_split_ix
+            if method % num_consensus_options <= consensus_opt_split_ix:
+                method = consensus_opt_split_ix + 1
+
+            operator_text += consensus_operator_prefix(weight_scheme, method, operator_text)
+
+            operator_text += '\n# Combine three DataFrames'
+            operator_text += '\ndfs = [{0}, {1}, {2}]'.format(operator[4], operator[5], operator[6])
+            operator_text += '''
+ignore_consensus = False
+if any(len(df.columns) == 3 for df in dfs):
+    found = False
+
+    for df in dfs:
+        if len(df.columns) > 3:
+            {0} = df.copy()
+            found = True
+            break
+    if not found:
+        ignore_consensus = True
+        {0} = dfs[0].copy()
+
+if not ignore_consensus:
+    weights = []
+    for df in dfs:
+        tup = df[['guess', 'class']]
+        num_correct = len(np.where(tup['guess'] == tup['class'])[0])
+        total_vals = len(tup['guess'].index)'''.format(result_name)
+            if consensus_options[weight_scheme % num_consensus_options] == 'accuracy':
+                operator_text +='''
+        weights.append(float(num_correct) / float(total_vals))
+        '''
+            elif consensus_options[weight_scheme % num_consensus_options] == 'uniform':
+                operator_text +='''
+        weights.append(1.0)
+        '''
+            operator_text += '''
+    # Initialize the dataFrame containing just the guesses, and to hold the results
+    merged_guesses = pd.DataFrame(data={0}[['guess']].values, columns=['guess_1'])
+    merged_guesses.loc[:, 'guess_2'] = {1}['guess']
+    merged_guesses.loc[:, 'guess_3'] = {2}['guess']
+    merged_guesses.loc[:, 'guess'] = None
+
+    for row_ix in merged_guesses.index:
+        merged_guesses['guess'].loc[row_ix] = method(merged_guesses[['guess_1', 'guess_2', 'guess_3']].iloc[row_ix], weights)
+    {3} = {0}.join({1}[[column for column in {1}.columns.values if column not in {0}.columns.values]])
+    {3} = {3}.join({2}[[column for column in {2}.columns.values if column not in {3}.columns.values]])
+    if 'guess' in {3}.columns.values:
+        {3} = {3}.drop('guess', 1).join(merged_guesses['guess']).copy()
+    else:
+        {3} = {3}.join(merged_guesses['guess'])
+        '''.format(operator[4], operator[5], operator[6], result_name)
+
+        elif operator_name == '_consensus_four':
+            weight_scheme = int(operator[2])
+            method = int(operator[3])
+            if weight_scheme % num_consensus_options > consensus_opt_split_ix:
+                weight_scheme = consensus_opt_split_ix
+            if method % num_consensus_options <= consensus_opt_split_ix:
+                method = consensus_opt_split_ix + 1
+
+            operator_text += consensus_operator_prefix(weight_scheme, method, operator_text)
+
+            operator_text += '\n# Combine four DataFrames'
+            operator_text += '\ndfs = [{0}, {1}, {2}, {3}]'.format(operator[4], operator[5], operator[6], operator[7])
+            operator_text += '''
+ignore_consensus = False
+if any(len(df.columns) == 3 for df in dfs):
+    found = False
+
+    for df in dfs:
+        if len(df.columns) > 3:
+            {0} = df.copy()
+            found = True
+            break
+    if not found:
+        ignore_consensus = True
+        {0} = dfs[0].copy()
+
+if not ignore_consensus:
+    weights = []
+    for df in dfs:
+        tup = df[['guess', 'class']]
+        num_correct = len(np.where(tup['guess'] == tup['class'])[0])
+        total_vals = len(tup['guess'].index)'''.format(result_name)
+            if consensus_options[weight_scheme % num_consensus_options] == 'accuracy':
+                operator_text +='''
+        weights.append(float(num_correct) / float(total_vals))
+        '''
+            elif consensus_options[weight_scheme % num_consensus_options] == 'uniform':
+                operator_text +='''
+        weights.append(1.0)
+        '''
+            operator_text += '''
+    # Initialize the dataFrame containing just the guesses, and to hold the results
+    merged_guesses = pd.DataFrame(data={0}[['guess']].values, columns=['guess_1'])
+    merged_guesses.loc[:, 'guess_2'] = {1}['guess']
+    merged_guesses.loc[:, 'guess_3'] = {2}['guess']
+    merged_guesses.loc[:, 'guess_4'] = {3}['guess']
+    merged_guesses.loc[:, 'guess'] = None
+
+    for row_ix in merged_guesses.index:
+        merged_guesses['guess'].loc[row_ix] = method(merged_guesses[['guess_1', 'guess_2', 'guess_3', 'guess_4']].iloc[row_ix], weights)
+    {4} = {0}.join({1}[[column for column in {1}.columns.values if column not in {0}.columns.values]])
+    {4} = {4}.join({2}[[column for column in {2}.columns.values if column not in {4}.columns.values]])
+    {4} = {4}.join({3}[[column for column in {3}.columns.values if column not in {4}.columns.values]])
+    if 'guess' in {4}.columns.values:
+        {4} = {4}.drop('guess', 1).join(merged_guesses['guess']).copy()
+    else:
+        {4} = {4}.join(merged_guesses['guess'])
+        '''.format(operator[4], operator[5], operator[6], result_name)
+
         elif operator_name == '_variance_threshold':
             threshold = float(operator[3])