From 6c8afec4ec5d38bdc2ea171458962c23bd09d554 Mon Sep 17 00:00:00 2001
From: Nathan <bartleyn@uchicago.edu>
Date: Thu, 18 Feb 2016 23:55:34 -0600
Subject: [PATCH 01/13] Added consensus operators, working through adaboost.
 Need to optimize memory usage.

---
 tpot/tpot.py | 263 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 262 insertions(+), 1 deletion(-)

diff --git a/tpot/tpot.py b/tpot/tpot.py
index 6fe10ba8..cfbab2e4 100644
--- a/tpot/tpot.py
+++ b/tpot/tpot.py
@@ -122,10 +122,17 @@ def __init__(self, population_size=100, generations=100,
         self.pset.addPrimitive(operator.sub, [int, int], int)
         self.pset.addPrimitive(operator.mul, [int, int], int)
         self.pset.addPrimitive(self._div, [int, int], float)
+        self.pset.addPrimitive(self._consensus_two, [str, str, pd.DataFrame, pd.DataFrame], pd.DataFrame)
+        self.pset.addPrimitive(self._consensus_three, [str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
+        self.pset.addPrimitive(self._consensus_four, [str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
+        self.pset.addPrimitive(self._ident, [str], str)
+
         for val in range(0, 101):
             self.pset.addTerminal(val, int)
         for val in [100.0, 10.0, 1.0, 0.1, 0.01, 0.001, 0.0001]:
             self.pset.addTerminal(val, float)
+        for val in ['max', 'mean', 'median', 'min', 'accuracy', 'uniform', 'adaboost']:
+            self.pset.addTerminal(val, str)
 
         creator.create('FitnessMax', base.Fitness, weights=(1.0,))
         creator.create('Individual', gp.PrimitiveTree, fitness=creator.FitnessMax)
@@ -863,14 +870,268 @@ def gradient_boosting(self, input_df, learning_rate, n_estimators, max_depth):
 
         return self._train_model_and_predict(input_df, GradientBoostingClassifier, learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth, random_state=42)
     
+  
+    def _get_ht_dict(self, classes, weights):
+        ret = {}
+        for cls in classes:
+            try:
+                ret[cls] += weights[cls]
+            except:
+                print(cls)
+                print(weights)
+                ret[cls] = weights[cls]
+        return ret
+
+    #max freq class
+    def _max_class(self, classes, weights):
+        ht = self._get_ht_dict(classes, weights)
+        return sorted(ht.items(), key=operator.itemgetter(1))[-1][0]
+    
+    def _mean_class(self, classes, weights):
+        ht = self._get_ht_dict(classes, weights)
+        mean_val = np.mean(ht.values())
+        return sorted(((x, abs(y - mean_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))[0]
+
+    def _median_class(self, classes, weights):
+        ht = self._get_ht_dict(classes, weights)
+        median_val = np.median(ht.values())
+        return sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))[0]
+
+    #minimum frequency class
+    def _min_class(self, classes, weights):
+        ht = self._get_ht_dict(classes, weights)
+        return sorted(ht.items(), key=operator.itemgetter(1))[0]
+
+    def _adaboost(self, classes, gt):
+        # weigh the incorrect classes higher than the correct classes
+        num_correct = len(np.where(classes == gt))
+        total = classes.size#
+        # this is not quite right, the weight here is for the classifier as a whole, not the weifghted error of a single mistake
+        #weights = pd.Series(data=np.array(0.5 * np.log((1- error)/(error)))
+
+        # e ^ - y_i * alpha_m * k_m (x_i)
+        #init_weights = np.sum(np.exp(-1 * gt * 1 * classes), axis=0)        
+        return 1.0
    
+    def _ident(*args):
+        return args
+
+    def _consensus_two(self, weighting, method, input_df1, input_df2):
+        """Takes the classifications of different models and combines them in a meaningful manner.
+        
+        Parameters
+        ----------
+        weighting: {'accuracy', 'uniform', 'adaboost'}
+            Method of weighting the classifications from the different DataFrames
+        method: {'max', 'mean', 'median', 'min'}
+            Method of combining the classifications from the different DataFrames
+        input_df1: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+            First input DataFrame to combine guesses
+        input_df2: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+            Second input DataFrame to combine guesses
+
+        Returns
+        -------
+        combined_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+            Combined DataFrames 
+            
+        """
+        #Validate input
+        #If either DF is empty and the other isn't, return a non-empty DF as it probably has reasonable guesses
+        #Otherwise if both are lacking, return the first one
+        dfs = [input_df1, input_df2]
+        if any(len(df.columns) == 3 for df in dfs):
+            for df in dfs:
+                if len(df.columns) > 3:
+                    return df
+            return dfs[0].copy() 
+
+        if weighting not in ['accuracy', 'uniform', 'adaboost']:
+            return dfs[0].copy()
+        if method not in ['max', 'mean', 'median', 'min']:
+            return dfs[0].copy()
+        #Establish the weights for each dataframe/classifier
+        #guesses_gt = [df[['guess','class']] for df in dfs]
+        guesses
+        
+        input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]).copy()
+        weights = []
+
+        #for tup in guesses_gt:
+        for df in dfs:
+            tup = df[['guess', 'class']]
+            num_correct = len(np.where(tup['guess'] == tup['class']))
+            total_vals = len(tup['guess'].index)
+            if weighting == 'accuracy':
+                weights.append(float(num_correct) / float(total_vals))
+            elif weighting == 'uniform':
+                weights.append(1.0)
+            elif weighting == 'adaboost':
+                weights.append(self._adaboost(tup['guess'], tup['class']))
+        method_f = None
+        if method == 'max':
+            method_f = self._max_class
+        elif method == 'mean':
+            method_f = self._mean_class
+        elif method == 'median':
+            method_f = self._median_class
+        elif method == 'min':
+            method_f = self._min_class
+
+        # for each sample, get the appropriate combined value
+        merged_guesses = pd.merge(input_df1[['guess']], input_df2[['guess']], suffixes=['_1', '_2'])
+        merged_guesses['res'] = None
+        for row_ix in merged_guesses.index:
+            merged_guesses['res'].iloc[row_ix] = method_f(merged_guesses.iloc[row_ix], weights)
+
+        return pd.join(combined_df, merged_guesses['res'].copy())
+
+    def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3):
+        """Takes the classifications of different models and combines them in a meaningful manner.
+        
+        Parameters
+        ----------
+        weighting: {'accuracy', 'uniform', 'adaboost'}
+            Method of weighting the classifications from the different DataFrames
+        method: {'max', 'mean', 'median', 'min'}
+            Method of combining the classifications from the different DataFrames
+        input_df1: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+            First input DataFrame to combine guesses
+        input_df2: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+            Second input DataFrame to combine guesses
+        input_df3: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+            Third input DataFrame to combine guesses
+
+        Returns
+        -------
+        combined_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+            Combined DataFrames 
+            
+        """
+        #Validate input
+        #If either DF is empty and the other isn't, return a non-empty DF as it probably has reasonable guesses
+        #Otherwise if both are lacking, return the first one
+        dfs = [input_df1, input_df2, input_df3]
+        if any(len(df.columns) == 3 for df in dfs):
+            for df in dfs:
+                if len(df.columns) > 3:
+                    return df
+            return dfs[0] 
+
+        if weighting not in ['accuracy', 'uniform', 'adaboost']:
+            return dfs[0]
+        if method not in ['max', 'mean', 'median', 'min']:
+            return dfs[0]
+        #Establish the weights for each dataframe/classifier
+        #guesses_gt = [df[['guess','class']] for df in dfs]
+        weights = []
+        #for tup in guesses_gt:
+        for df in dfs:
+            tup = df[['guess', 'class']]
+            num_correct = len(np.where(tup['guess'] == tup['class']))
+            total_vals = len(tup['guess'].index)
+            if weighting == 'accuracy':
+                weights.append(float(num_correct) / float(total_vals))
+            elif weighting == 'uniform':
+                weights.append(1.0)
+            elif weighting == 'adaboost':
+                weights.append(self._adaboost(tup['guess'], tup['class']))
+        method_f = None
+        if method == 'max':
+            method_f = self._max_class
+        elif method == 'mean':
+            method_f = self._mean_class
+        elif method == 'median':
+            method_f = self._median_class
+        elif method == 'min':
+            method_f = self._min_class
+
+        # for each sample, get the appropriate combined value
+        merged_guesses = pd.merge(input_df1[['guess']], pd.merge(input_df2[['guess']], input_df3[['guess']], suffixes=['_2', '_3']),  suffixes=['_1', '_2'])
+        merged_guesses['res'] = None
+        for row_ix in merged_guesses.index:
+            merged_guesses['res'].iloc[row_ix] = method_f(merged_guesses.iloc[row_ix], weights)
+
+        return pd.join(combined_df, merged_guesses['res'].copy())
+
+    def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, input_df4):
+        """Takes the classifications of different models and combines them in a meaningful manner.
+        
+        Parameters
+        ----------
+        weighting: {'accuracy', 'uniform', 'adaboost'}
+            Method of weighting the classifications from the different DataFrames
+        method: {'max', 'mean', 'median', 'min'}
+            Method of combining the classifications from the different DataFrames
+        input_df1: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+            First input DataFrame to combine guesses
+        input_df2: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+            Second input DataFrame to combine guesses
+        input_df3: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+            Third input DataFrame to combine guesses
+        input_df4: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+            Fourth input DataFrame to combine guesses
+
+        Returns
+        -------
+        combined_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+            Combined DataFrames 
+            
+        """
+        #Validate input
+        #If either DF is empty and the other isn't, return a non-empty DF as it probably has reasonable guesses
+        #Otherwise if both are lacking, return the first one
+        dfs = [input_df1, input_df2, input_df3, input_df4]
+        if any(len(df.columns) == 3 for df in dfs):
+            for df in dfs:
+                if len(df.columns) > 3:
+                    return df
+            return dfs[0] 
+
+        if weighting not in ['accuracy', 'uniform', 'adaboost']:
+            return dfs[0]
+        if method not in ['max', 'mean', 'median', 'min']:
+            return dfs[0]
+        #Establish the weights for each dataframe/classifier
+        #guesses_gt = [df[['guess','class']] for df in dfs]
+        weights = []
+        #for tup in guesses_gt:
+        for df in dfs:
+            tup = df[['guess', 'total']]
+            num_correct = len(np.where(tup['guess'] == tup['class']))
+            total_vals = len(tup['guess'].index)
+            if weighting == 'accuracy':
+                weights.append(float(num_correct) / float(total_vals))
+            elif weighting == 'uniform':
+                weights.append(1.0)
+            elif weighting == 'adaboost':
+                weights.append(self._adaboost(tup['guess'], tup['class']))
+        method_f = None
+        if method == 'max':
+            method_f = self._max_class
+        elif method == 'mean':
+            method_f = self._mean_class
+        elif method == 'median':
+            method_f = self._median_class
+        elif method == 'min':
+            method_f = self._min_class
+
+        # for each sample, get the appropriate combined value
+        merged_guesses = pd.merge(input_df1[['guess']], pd.merge(input_df2[['guess']], pd.merge(input_df3[['guess']], input_df4[['guess']], suffixes=['_3', '_4']), suffixes=['_2', '_3']), suffixes=['_1', '_2'])
+        merged_guesses['res'] = None
+        for row_ix in merged_guesses.index:
+            merged_guesses['res'].iloc[row_ix] = method_f(merged_guesses.iloc[row_ix], weights)
+
+        return pd.join(combined_df, merged_guesses['res'].copy())
+
+
     def _train_model_and_predict(self, input_df, model, **kwargs):
         """Fits an arbitrary sklearn classifier model with a set of keyword parameters
 
         Parameters
         ----------
         input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
-            Input DataFrame for fitting the k-neares
+            Input DataFrame for fitting the model
         model: sklearn classifier
             Input model to fit and predict on input_df
         kwargs: unpacked parameters

From 160593e1e0e1d821ba8f849daddd35b64efae783 Mon Sep 17 00:00:00 2001
From: Nathan <bartleyn@uchicago.edu>
Date: Sun, 21 Feb 2016 23:34:52 -0600
Subject: [PATCH 02/13] consensus_two working; needs another once-over for
 optimization and comment cleanup

---
 tpot/tpot.py | 122 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 84 insertions(+), 38 deletions(-)

diff --git a/tpot/tpot.py b/tpot/tpot.py
index cfbab2e4..5eddcd69 100644
--- a/tpot/tpot.py
+++ b/tpot/tpot.py
@@ -108,7 +108,7 @@ def __init__(self, population_size=100, generations=100,
         self.pset.addPrimitive(self.svc, [pd.DataFrame, float], pd.DataFrame)
         self.pset.addPrimitive(self.knnc, [pd.DataFrame, int], pd.DataFrame)
         self.pset.addPrimitive(self.gradient_boosting, [pd.DataFrame, float, int, int], pd.DataFrame)
-        self.pset.addPrimitive(self._combine_dfs, [pd.DataFrame, pd.DataFrame], pd.DataFrame)
+        #self.pset.addPrimitive(self._combine_dfs, [pd.DataFrame, pd.DataFrame], pd.DataFrame)
         self.pset.addPrimitive(self._variance_threshold, [pd.DataFrame, float], pd.DataFrame)
         self.pset.addPrimitive(self._select_kbest, [pd.DataFrame, int], pd.DataFrame) 
         self.pset.addPrimitive(self._select_percentile, [pd.DataFrame, int], pd.DataFrame)
@@ -122,17 +122,17 @@ def __init__(self, population_size=100, generations=100,
         self.pset.addPrimitive(operator.sub, [int, int], int)
         self.pset.addPrimitive(operator.mul, [int, int], int)
         self.pset.addPrimitive(self._div, [int, int], float)
-        self.pset.addPrimitive(self._consensus_two, [str, str, pd.DataFrame, pd.DataFrame], pd.DataFrame)
-        self.pset.addPrimitive(self._consensus_three, [str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
-        self.pset.addPrimitive(self._consensus_four, [str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
-        self.pset.addPrimitive(self._ident, [str], str)
+        self.pset.addPrimitive(self._consensus_two, [int, int, pd.DataFrame, pd.DataFrame], pd.DataFrame)
+        #self.pset.addPrimitive(self._consensus_three, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
+        #self.pset.addPrimitive(self._consensus_four, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
+        #self.pset.addPrimitive(self._ident, [str], str)
 
         for val in range(0, 101):
             self.pset.addTerminal(val, int)
         for val in [100.0, 10.0, 1.0, 0.1, 0.01, 0.001, 0.0001]:
             self.pset.addTerminal(val, float)
-        for val in ['max', 'mean', 'median', 'min', 'accuracy', 'uniform', 'adaboost']:
-            self.pset.addTerminal(val, str)
+        #for val in ['max', 'mean', 'median', 'min', 'accuracy', 'uniform', 'adaboost']:
+        #    self.pset.addTerminal(val, str)
 
         creator.create('FitnessMax', base.Fitness, weights=(1.0,))
         creator.create('Individual', gp.PrimitiveTree, fitness=creator.FitnessMax)
@@ -873,13 +873,14 @@ def gradient_boosting(self, input_df, learning_rate, n_estimators, max_depth):
   
     def _get_ht_dict(self, classes, weights):
         ret = {}
+        #instead of weights of class needs to be weights of classifier
+        ctr = 0
         for cls in classes:
             try:
-                ret[cls] += weights[cls]
+                ret[cls] += weights[ctr]
             except:
-                print(cls)
-                print(weights)
-                ret[cls] = weights[cls]
+                ret[cls] = weights[ctr]
+            ctr += 1
         return ret
 
     #max freq class
@@ -890,17 +891,17 @@ def _max_class(self, classes, weights):
     def _mean_class(self, classes, weights):
         ht = self._get_ht_dict(classes, weights)
         mean_val = np.mean(ht.values())
-        return sorted(((x, abs(y - mean_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))[0]
+        return sorted(((x, abs(y - mean_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))[0][0]
 
     def _median_class(self, classes, weights):
         ht = self._get_ht_dict(classes, weights)
         median_val = np.median(ht.values())
-        return sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))[0]
+        return sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))[0][0]
 
     #minimum frequency class
     def _min_class(self, classes, weights):
         ht = self._get_ht_dict(classes, weights)
-        return sorted(ht.items(), key=operator.itemgetter(1))[0]
+        return sorted(ht.items(), key=operator.itemgetter(1))[0][0]
 
     def _adaboost(self, classes, gt):
         # weigh the incorrect classes higher than the correct classes
@@ -911,7 +912,7 @@ def _adaboost(self, classes, gt):
 
         # e ^ - y_i * alpha_m * k_m (x_i)
         #init_weights = np.sum(np.exp(-1 * gt * 1 * classes), axis=0)        
-        return 1.0
+        return (1.0)
    
     def _ident(*args):
         return args
@@ -946,15 +947,20 @@ def _consensus_two(self, weighting, method, input_df1, input_df2):
                     return df
             return dfs[0].copy() 
 
-        if weighting not in ['accuracy', 'uniform', 'adaboost']:
+        #if weighting not in ['accuracy', 'uniform', 'adaboost']:
+        if weighting % 7 > 2:
             return dfs[0].copy()
-        if method not in ['max', 'mean', 'median', 'min']:
+        #if method not in ['max', 'mean', 'median', 'min']:
+        if method % 7 <= 2:
             return dfs[0].copy()
+
+        options = ['accuracy', 'uniform', 'adaboost', 'max', 'mean', 'median', 'min']
+        weighting = options[weighting % 7]
+        method = options[method % 7]
         #Establish the weights for each dataframe/classifier
         #guesses_gt = [df[['guess','class']] for df in dfs]
-        guesses
         
-        input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]).copy()
+
         weights = []
 
         #for tup in guesses_gt:
@@ -978,14 +984,26 @@ def _consensus_two(self, weighting, method, input_df1, input_df2):
         elif method == 'min':
             method_f = self._min_class
 
+        merged_guesses = pd.DataFrame(data=input_df1[['guess']], columns=['guess_1'])
+        merged_guesses.loc[:, 'guess_2'] = input_df2['guess']
         # for each sample, get the appropriate combined value
-        merged_guesses = pd.merge(input_df1[['guess']], input_df2[['guess']], suffixes=['_1', '_2'])
-        merged_guesses['res'] = None
+        #merged_guesses = pd.merge(df1_guesses, df2_guesses, suffixes=['_1', '_2']).copy()
+        #print('merged_guesses columsn {}'.format(merged_guesses.columns.values))
+        #print('types!!!!!! {} {}'.format(type(df1_guesses), type(df2_guesses)))
+        #merged_guesses = pd.DataFrame()
+        #merged_guesses.loc[:, 'guess_1'] = df1_guesses.copy()
+        #merged_guesses.loc[:, 'guess_2'] = df2_guesses.copy()
+        merged_guesses.loc[:, 'guess'] = None
         for row_ix in merged_guesses.index:
-            merged_guesses['res'].iloc[row_ix] = method_f(merged_guesses.iloc[row_ix], weights)
-
-        return pd.join(combined_df, merged_guesses['res'].copy())
-
+            merged_guesses['guess'].loc[row_ix] = method_f(merged_guesses[['guess_1', 'guess_2']].iloc[row_ix], weights)
+        combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]])
+        if 'guess' in combined_df.columns.values:
+            #print('cols: combined_df then merged_guesses')
+            #print(combined_df.columns.values)
+            #print(merged_guesses.columns.values)
+            return combined_df.drop('guess', 1).join(merged_guesses['guess']).copy()
+        else:
+            return combined_df.join(merged_guesses['guess'])
     def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3):
         """Takes the classifications of different models and combines them in a meaningful manner.
         
@@ -1016,14 +1034,22 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3):
             for df in dfs:
                 if len(df.columns) > 3:
                     return df
-            return dfs[0] 
+            return dfs[0].copy() 
 
-        if weighting not in ['accuracy', 'uniform', 'adaboost']:
-            return dfs[0]
-        if method not in ['max', 'mean', 'median', 'min']:
-            return dfs[0]
+        #if weighting not in ['accuracy', 'uniform', 'adaboost']:
+        if weighting % 7 > 2:
+            return dfs[0].copy()
+        #if method not in ['max', 'mean', 'median', 'min']:
+        if method % 7 <= 2:
+            return dfs[0].copy()
+
+        options = ['accuracy', 'uniform', 'adaboost', 'max', 'mean', 'median', 'min']
+        weighting = options[weighting % 7]
+        method = options[method % 7]
         #Establish the weights for each dataframe/classifier
         #guesses_gt = [df[['guess','class']] for df in dfs]
+        combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]])
+        combined_df = combined_df.join(input_df3[[column for column in input_df3.columns.values if column not in combined_df.columns.values]])
         weights = []
         #for tup in guesses_gt:
         for df in dfs:
@@ -1031,9 +1057,9 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3):
             num_correct = len(np.where(tup['guess'] == tup['class']))
             total_vals = len(tup['guess'].index)
             if weighting == 'accuracy':
-                weights.append(float(num_correct) / float(total_vals))
+                weights.append((float(num_correct) / float(total_vals)))
             elif weighting == 'uniform':
-                weights.append(1.0)
+                weights.append((1.0))
             elif weighting == 'adaboost':
                 weights.append(self._adaboost(tup['guess'], tup['class']))
         method_f = None
@@ -1046,6 +1072,9 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3):
         elif method == 'min':
             method_f = self._min_class
 
+        df1_guesses = input_df1[['guess']]
+        df2_guesses = input_df2[['guess']]
+        df3_guesses = input_df3[['guess']]
         # for each sample, get the appropriate combined value
         merged_guesses = pd.merge(input_df1[['guess']], pd.merge(input_df2[['guess']], input_df3[['guess']], suffixes=['_2', '_3']),  suffixes=['_1', '_2'])
         merged_guesses['res'] = None
@@ -1088,16 +1117,26 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in
                     return df
             return dfs[0] 
 
-        if weighting not in ['accuracy', 'uniform', 'adaboost']:
-            return dfs[0]
-        if method not in ['max', 'mean', 'median', 'min']:
-            return dfs[0]
+        #if weighting not in ['accuracy', 'uniform', 'adaboost']:
+        if weighting % 7 > 2:
+            return dfs[0].copy()
+        #if method not in ['max', 'mean', 'median', 'min']:
+        if method % 7 <= 2:
+            return dfs[0].copy()
+
+        options = ['accuracy', 'uniform', 'adaboost', 'max', 'mean', 'median', 'min']
+        weighting = options[weighting % 7]
+        method = options[method % 7]
+        combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]).copy()
+        combined_df = combined_df.join(input_df3[[columns for column in input_df3.columns.values if column not in combined_df.columns.values]]).copy()
+        combined_df = combined_df.join(input_df4[[columns for column in input_df4.columns.values if column not in combined_df.columns.values]]).copy()
+
         #Establish the weights for each dataframe/classifier
         #guesses_gt = [df[['guess','class']] for df in dfs]
         weights = []
         #for tup in guesses_gt:
         for df in dfs:
-            tup = df[['guess', 'total']]
+            tup = df[['guess', 'class']]
             num_correct = len(np.where(tup['guess'] == tup['class']))
             total_vals = len(tup['guess'].index)
             if weighting == 'accuracy':
@@ -1116,8 +1155,12 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in
         elif method == 'min':
             method_f = self._min_class
 
+        df1_guesses = input_df1[['guess']]
+        df2_guesses = input_df2[['guess']]
+        df3_guesses = input_df3[['guess']]
+        df4_guesses = input_df4[['guess']]
         # for each sample, get the appropriate combined value
-        merged_guesses = pd.merge(input_df1[['guess']], pd.merge(input_df2[['guess']], pd.merge(input_df3[['guess']], input_df4[['guess']], suffixes=['_3', '_4']), suffixes=['_2', '_3']), suffixes=['_1', '_2'])
+        merged_guesses = pd.merge(df1_guesses, pd.merge(df2_guesses, pd.merge(df3_guesses, df4_guesses, suffixes=['_3', '_4']), suffixes=['_2', '_3']), suffixes=['_1', '_2'])
         merged_guesses['res'] = None
         for row_ix in merged_guesses.index:
             merged_guesses['res'].iloc[row_ix] = method_f(merged_guesses.iloc[row_ix], weights)
@@ -1526,6 +1569,7 @@ def _evaluate_individual(self, individual, training_testing_data):
             # Throw out GP expressions that are too large to be compiled in Python
             return 0.,
 
+        print(individual)
         result = func(training_testing_data)
         result = result[result['group'] == 'testing']
         res = self.scoring_function(result)
@@ -1553,6 +1597,8 @@ def _balanced_accuracy(self, result):
         all_classes = list(set(result['class'].values))
         all_class_accuracies = []
         for this_class in all_classes:
+            #print("this class {}".format(this_class))
+            #print("result['guess'] {}".format(result['guess']))
             this_class_accuracy = len(result[(result['guess'] == this_class) \
                 & (result['class'] == this_class)])\
                 / float(len(result[result['class'] == this_class]))

From 6d0ba52d2aa96ac197ad931093f64be1bab6852e Mon Sep 17 00:00:00 2001
From: Nathan <bartleyn@uchicago.edu>
Date: Mon, 22 Feb 2016 00:10:52 -0600
Subject: [PATCH 03/13] consensus_three and consensus_four seem to work, with
 no memory blowups like before. Still requires more thorough testing and
 finishing the adaboost weighting

---
 tpot/tpot.py | 68 +++++++++++++++++++++++-----------------------------
 1 file changed, 30 insertions(+), 38 deletions(-)

diff --git a/tpot/tpot.py b/tpot/tpot.py
index 5eddcd69..7c743151 100644
--- a/tpot/tpot.py
+++ b/tpot/tpot.py
@@ -123,8 +123,8 @@ def __init__(self, population_size=100, generations=100,
         self.pset.addPrimitive(operator.mul, [int, int], int)
         self.pset.addPrimitive(self._div, [int, int], float)
         self.pset.addPrimitive(self._consensus_two, [int, int, pd.DataFrame, pd.DataFrame], pd.DataFrame)
-        #self.pset.addPrimitive(self._consensus_three, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
-        #self.pset.addPrimitive(self._consensus_four, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
+        self.pset.addPrimitive(self._consensus_three, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
+        self.pset.addPrimitive(self._consensus_four, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
         #self.pset.addPrimitive(self._ident, [str], str)
 
         for val in range(0, 101):
@@ -986,21 +986,11 @@ def _consensus_two(self, weighting, method, input_df1, input_df2):
 
         merged_guesses = pd.DataFrame(data=input_df1[['guess']], columns=['guess_1'])
         merged_guesses.loc[:, 'guess_2'] = input_df2['guess']
-        # for each sample, get the appropriate combined value
-        #merged_guesses = pd.merge(df1_guesses, df2_guesses, suffixes=['_1', '_2']).copy()
-        #print('merged_guesses columsn {}'.format(merged_guesses.columns.values))
-        #print('types!!!!!! {} {}'.format(type(df1_guesses), type(df2_guesses)))
-        #merged_guesses = pd.DataFrame()
-        #merged_guesses.loc[:, 'guess_1'] = df1_guesses.copy()
-        #merged_guesses.loc[:, 'guess_2'] = df2_guesses.copy()
         merged_guesses.loc[:, 'guess'] = None
         for row_ix in merged_guesses.index:
             merged_guesses['guess'].loc[row_ix] = method_f(merged_guesses[['guess_1', 'guess_2']].iloc[row_ix], weights)
         combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]])
         if 'guess' in combined_df.columns.values:
-            #print('cols: combined_df then merged_guesses')
-            #print(combined_df.columns.values)
-            #print(merged_guesses.columns.values)
             return combined_df.drop('guess', 1).join(merged_guesses['guess']).copy()
         else:
             return combined_df.join(merged_guesses['guess'])
@@ -1048,8 +1038,6 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3):
         method = options[method % 7]
         #Establish the weights for each dataframe/classifier
         #guesses_gt = [df[['guess','class']] for df in dfs]
-        combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]])
-        combined_df = combined_df.join(input_df3[[column for column in input_df3.columns.values if column not in combined_df.columns.values]])
         weights = []
         #for tup in guesses_gt:
         for df in dfs:
@@ -1057,9 +1045,9 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3):
             num_correct = len(np.where(tup['guess'] == tup['class']))
             total_vals = len(tup['guess'].index)
             if weighting == 'accuracy':
-                weights.append((float(num_correct) / float(total_vals)))
+                weights.append(float(num_correct) / float(total_vals))
             elif weighting == 'uniform':
-                weights.append((1.0))
+                weights.append(1.0)
             elif weighting == 'adaboost':
                 weights.append(self._adaboost(tup['guess'], tup['class']))
         method_f = None
@@ -1072,16 +1060,20 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3):
         elif method == 'min':
             method_f = self._min_class
 
-        df1_guesses = input_df1[['guess']]
-        df2_guesses = input_df2[['guess']]
-        df3_guesses = input_df3[['guess']]
-        # for each sample, get the appropriate combined value
-        merged_guesses = pd.merge(input_df1[['guess']], pd.merge(input_df2[['guess']], input_df3[['guess']], suffixes=['_2', '_3']),  suffixes=['_1', '_2'])
-        merged_guesses['res'] = None
+        merged_guesses = pd.DataFrame(data=input_df1[['guess']], columns=['guess_1'])
+        merged_guesses.loc[:, 'guess_2'] = input_df2['guess']
+        merged_guesses.loc[:, 'guess_3'] = input_df3['guess']
+        merged_guesses.loc[:, 'guess'] = None
         for row_ix in merged_guesses.index:
-            merged_guesses['res'].iloc[row_ix] = method_f(merged_guesses.iloc[row_ix], weights)
+            merged_guesses['guess'].loc[row_ix] = method_f(merged_guesses[['guess_1', 'guess_2', 'guess_3']].iloc[row_ix], weights)
+        combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]])
+        combined_df = combined_df.join(input_df3[[column for column in input_df3.columns.values if column not in combined_df.columns.values]])
+        if 'guess' in combined_df.columns.values:
+            return combined_df.drop('guess', 1).join(merged_guesses['guess']).copy()
+        else:
+            return combined_df.join(merged_guesses['guess'])
+
 
-        return pd.join(combined_df, merged_guesses['res'].copy())
 
     def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, input_df4):
         """Takes the classifications of different models and combines them in a meaningful manner.
@@ -1115,7 +1107,7 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in
             for df in dfs:
                 if len(df.columns) > 3:
                     return df
-            return dfs[0] 
+            return dfs[0].copy() 
 
         #if weighting not in ['accuracy', 'uniform', 'adaboost']:
         if weighting % 7 > 2:
@@ -1127,10 +1119,6 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in
         options = ['accuracy', 'uniform', 'adaboost', 'max', 'mean', 'median', 'min']
         weighting = options[weighting % 7]
         method = options[method % 7]
-        combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]).copy()
-        combined_df = combined_df.join(input_df3[[columns for column in input_df3.columns.values if column not in combined_df.columns.values]]).copy()
-        combined_df = combined_df.join(input_df4[[columns for column in input_df4.columns.values if column not in combined_df.columns.values]]).copy()
-
         #Establish the weights for each dataframe/classifier
         #guesses_gt = [df[['guess','class']] for df in dfs]
         weights = []
@@ -1155,17 +1143,21 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in
         elif method == 'min':
             method_f = self._min_class
 
-        df1_guesses = input_df1[['guess']]
-        df2_guesses = input_df2[['guess']]
-        df3_guesses = input_df3[['guess']]
-        df4_guesses = input_df4[['guess']]
-        # for each sample, get the appropriate combined value
-        merged_guesses = pd.merge(df1_guesses, pd.merge(df2_guesses, pd.merge(df3_guesses, df4_guesses, suffixes=['_3', '_4']), suffixes=['_2', '_3']), suffixes=['_1', '_2'])
-        merged_guesses['res'] = None
+        merged_guesses = pd.DataFrame(data=input_df1[['guess']], columns=['guess_1'])
+        merged_guesses.loc[:, 'guess_2'] = input_df2['guess']
+        merged_guesses.loc[:, 'guess_3'] = input_df3['guess']
+        merged_guesses.loc[:, 'guess_4'] = input_df3['guess']
+        merged_guesses.loc[:, 'guess'] = None
         for row_ix in merged_guesses.index:
-            merged_guesses['res'].iloc[row_ix] = method_f(merged_guesses.iloc[row_ix], weights)
+            merged_guesses['guess'].loc[row_ix] = method_f(merged_guesses[['guess_1', 'guess_2', 'guess_3', 'guess_4']].iloc[row_ix], weights)
+        combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]])
+        combined_df = combined_df.join(input_df3[[column for column in input_df3.columns.values if column not in combined_df.columns.values]])
+        combined_df = combined_df.join(input_df4[[column for column in input_df4.columns.values if column not in combined_df.columns.values]])
+        if 'guess' in combined_df.columns.values:
+            return combined_df.drop('guess', 1).join(merged_guesses['guess']).copy()
+        else:
+            return combined_df.join(merged_guesses['guess'])
 
-        return pd.join(combined_df, merged_guesses['res'].copy())
 
 
     def _train_model_and_predict(self, input_df, model, **kwargs):

From bb8fc28d771a2b56dc76499eaa51e5397fda7869 Mon Sep 17 00:00:00 2001
From: Nathan <bartleyn@uchicago.edu>
Date: Thu, 25 Feb 2016 23:59:59 -0600
Subject: [PATCH 04/13] Changed Twitter Bot to TPOT in License, added
 consensus_two, three, four, simple tests.

---
 tests.py     | 63 ++++++++++++++++++++++++++++++++++++++++++++++
 tpot/tpot.py | 70 +++++++++++++++++++++++++++++++++-------------------
 2 files changed, 107 insertions(+), 26 deletions(-)

diff --git a/tests.py b/tests.py
index 70f3a10a..276336d6 100644
--- a/tests.py
+++ b/tests.py
@@ -124,3 +124,66 @@ def test_static_models():
         assert np.array_equal(result['guess'].values, sklearn_model_obj.predict(testing_features)), "Model {} failed".format(str(model))
 
 
+def test_consensus():
+    tpot_obj = TPOT()
+
+    df1 = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                        'guess': [1,0,1,0,1,0,1,0,1,0]})
+
+    df2 = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                        'guess': [0,0,0,0,0,1,1,1,1,1]})
+
+    df3 = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                        'guess': [0,1,0,1,0,1,0,1,0,1]})
+
+    df4 = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                        'guess': [1,1,0,0,1,1,0,0,1,1]})
+
+    uniform = [1.0] * 10
+    adaboost = [1.0] * 10
+
+    accuracy1 = 0.60
+    accuracy2 = 0.0
+    accuracy3 = 0.40
+    accuracy4 = 0.50
+    res_two_acc_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                    'guess': [1,0,1,0,1,0,1,0,1,0]})
+    res_two_acc_mean = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                     'guess': [1,0,1,0,1,0,1,0,1,0]})
+    res_two_acc_med = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                    'guess': [1,0,1,0,1,0,1,0,1,0]})
+    res_two_acc_min = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                    'guess': [0,0,0,0,0,1,1,1,1,1]})
+    res_two_uni_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                    'guess': [1,0,1,0,1,0,1,0,1,0]})
+    res_two_ada_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                    'guess': [1,0,1,0,1,0,1,0,1,0]})
+    res_three_ada_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                      'guess': [1,1,0,0,1,1,0,0,1,1]})
+    res_four_ada_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                     'guess': [1,0,0,0,1,1,1,0,1,1]})
+    
+    tests = [(tpot_obj._consensus_two, 0, 3, df1, df2), #accuracy, max
+             (tpot_obj._consensus_two, 0, 4, df1, df2), #accuracy mean
+             (tpot_obj._consensus_two, 0, 5, df1, df2), # accuracy median
+             (tpot_obj._consensus_two, 0, 6, df1, df2), # accuracy min
+             (tpot_obj._consensus_two, 1, 3, df1, df2), # uniform max
+             (tpot_obj._consensus_two, 2, 3, df1, df2), # adaboost max
+             (tpot_obj._consensus_three, 2, 3, df1, df3, df4), # adaboost max
+             (tpot_obj._consensus_four, 2, 3, df1, df2, df3, df4) #adaboost max
+             ]
+    results = [res_two_acc_max,
+           res_two_acc_mean,
+           res_two_acc_med,
+           res_two_acc_min,
+           res_two_uni_max,
+           res_two_ada_max,
+           res_three_ada_max,
+           res_four_ada_max
+           ]
+
+    for test, res in zip(tests, results):
+        print test[1:3]
+        print test[0](*test[1:])['guess'].values
+        assert np.array_equal(test[0](*test[1:])['guess'].values, res['guess'].values)
+
diff --git a/tpot/tpot.py b/tpot/tpot.py
index 7c743151..36276a13 100644
--- a/tpot/tpot.py
+++ b/tpot/tpot.py
@@ -14,7 +14,7 @@
 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along with
-the Twitter Bot library. If not, see http://www.gnu.org/licenses/.
+the TPOT library. If not, see http://www.gnu.org/licenses/.
 
 """
 
@@ -131,8 +131,6 @@ def __init__(self, population_size=100, generations=100,
             self.pset.addTerminal(val, int)
         for val in [100.0, 10.0, 1.0, 0.1, 0.01, 0.001, 0.0001]:
             self.pset.addTerminal(val, float)
-        #for val in ['max', 'mean', 'median', 'min', 'accuracy', 'uniform', 'adaboost']:
-        #    self.pset.addTerminal(val, str)
 
         creator.create('FitnessMax', base.Fitness, weights=(1.0,))
         creator.create('Individual', gp.PrimitiveTree, fitness=creator.FitnessMax)
@@ -883,30 +881,49 @@ def _get_ht_dict(self, classes, weights):
             ctr += 1
         return ret
 
+    
+    def _get_top(self, classes, tups):
+        """Return the first entry of `classes` (input order, so input_df1 wins ties) whose score ties with the first (best-ranked) tuple in `tups`.
+        """
+        values = [tup[0] for tup in tups if tup[1] == tups[0][1]]
+        for class_ in classes:
+            if class_ in values:
+                return class_
+
     #max freq class
     def _max_class(self, classes, weights):
+        """Return the class with the highest weight, or the class that appears first with that weight (e.g., input_df1)
+        """
         ht = self._get_ht_dict(classes, weights)
-        return sorted(ht.items(), key=operator.itemgetter(1))[-1][0]
+        return self._get_top(classes, sorted(ht.items(), key=operator.itemgetter(1), reverse=True))
     
     def _mean_class(self, classes, weights):
+        """Return the class closest to the mean weight, or the class that appears first with that weight (e.g., input_df1)
+        """
         ht = self._get_ht_dict(classes, weights)
         mean_val = np.mean(ht.values())
-        return sorted(((x, abs(y - mean_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))[0][0]
+        return self._get_top(classes, sorted(((x, abs(y - mean_val)) for (x,y) in ht.items()), key=operator.itemgetter(1)))
 
     def _median_class(self, classes, weights):
+        """Return the class closest to the median weight, or the class that appears first with that weight (e.g., input_df1)
+        """
         ht = self._get_ht_dict(classes, weights)
         median_val = np.median(ht.values())
-        return sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))[0][0]
+        return self._get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1)))
 
     #minimum frequency class
     def _min_class(self, classes, weights):
+        """Return the class with the minimal weight, or the class that appears first with that weight (e.g., input_df1)
+        """
         ht = self._get_ht_dict(classes, weights)
-        return sorted(ht.items(), key=operator.itemgetter(1))[0][0]
+        return self._get_top(classes, sorted(ht.items(), key=operator.itemgetter(1)))
 
     def _adaboost(self, classes, gt):
+        """Weigh each of a DataFrame's guesses according to an adaboost-like scheme.
+        """
         # weigh the incorrect classes higher than the correct classes
-        num_correct = len(np.where(classes == gt))
-        total = classes.size#
+        #num_correct = len(np.where(classes == gt))
+        #total = classes.size
         # this is not quite right, the weight here is for the classifier as a whole, not the weifghted error of a single mistake
         #weights = pd.Series(data=np.array(0.5 * np.log((1- error)/(error)))
 
@@ -914,8 +931,6 @@ def _adaboost(self, classes, gt):
         #init_weights = np.sum(np.exp(-1 * gt * 1 * classes), axis=0)        
         return (1.0)
    
-    def _ident(*args):
-        return args
 
     def _consensus_two(self, weighting, method, input_df1, input_df2):
         """Takes the classifications of different models and combines them in a meaningful manner.
@@ -958,15 +973,10 @@ def _consensus_two(self, weighting, method, input_df1, input_df2):
         weighting = options[weighting % 7]
         method = options[method % 7]
         #Establish the weights for each dataframe/classifier
-        #guesses_gt = [df[['guess','class']] for df in dfs]
-        
-
         weights = []
-
-        #for tup in guesses_gt:
         for df in dfs:
             tup = df[['guess', 'class']]
-            num_correct = len(np.where(tup['guess'] == tup['class']))
+            num_correct = len(np.where(tup['guess'] == tup['class'])[0])
             total_vals = len(tup['guess'].index)
             if weighting == 'accuracy':
                 weights.append(float(num_correct) / float(total_vals))
@@ -974,6 +984,7 @@ def _consensus_two(self, weighting, method, input_df1, input_df2):
                 weights.append(1.0)
             elif weighting == 'adaboost':
                 weights.append(self._adaboost(tup['guess'], tup['class']))
+        #Set the method function for evaluating each DataFrame
         method_f = None
         if method == 'max':
             method_f = self._max_class
@@ -984,9 +995,11 @@ def _consensus_two(self, weighting, method, input_df1, input_df2):
         elif method == 'min':
             method_f = self._min_class
 
-        merged_guesses = pd.DataFrame(data=input_df1[['guess']], columns=['guess_1'])
+        # Initialize the dataFrame containing just the guesses, and to hold the results
+        merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1'])
         merged_guesses.loc[:, 'guess_2'] = input_df2['guess']
         merged_guesses.loc[:, 'guess'] = None
+        
         for row_ix in merged_guesses.index:
             merged_guesses['guess'].loc[row_ix] = method_f(merged_guesses[['guess_1', 'guess_2']].iloc[row_ix], weights)
         combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]])
@@ -994,6 +1007,7 @@ def _consensus_two(self, weighting, method, input_df1, input_df2):
             return combined_df.drop('guess', 1).join(merged_guesses['guess']).copy()
         else:
             return combined_df.join(merged_guesses['guess'])
+    
     def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3):
         """Takes the classifications of different models and combines them in a meaningful manner.
         
@@ -1037,9 +1051,7 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3):
         weighting = options[weighting % 7]
         method = options[method % 7]
         #Establish the weights for each dataframe/classifier
-        #guesses_gt = [df[['guess','class']] for df in dfs]
         weights = []
-        #for tup in guesses_gt:
         for df in dfs:
             tup = df[['guess', 'class']]
             num_correct = len(np.where(tup['guess'] == tup['class']))
@@ -1050,6 +1062,8 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3):
                 weights.append(1.0)
             elif weighting == 'adaboost':
                 weights.append(self._adaboost(tup['guess'], tup['class']))
+        
+        #Set the method function for evaluating each DataFrame
         method_f = None
         if method == 'max':
             method_f = self._max_class
@@ -1060,10 +1074,12 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3):
         elif method == 'min':
             method_f = self._min_class
 
-        merged_guesses = pd.DataFrame(data=input_df1[['guess']], columns=['guess_1'])
+        # Initialize the dataFrame containing just the guesses, and to hold the results
+        merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1'])
         merged_guesses.loc[:, 'guess_2'] = input_df2['guess']
         merged_guesses.loc[:, 'guess_3'] = input_df3['guess']
         merged_guesses.loc[:, 'guess'] = None
+        
         for row_ix in merged_guesses.index:
             merged_guesses['guess'].loc[row_ix] = method_f(merged_guesses[['guess_1', 'guess_2', 'guess_3']].iloc[row_ix], weights)
         combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]])
@@ -1120,9 +1136,7 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in
         weighting = options[weighting % 7]
         method = options[method % 7]
         #Establish the weights for each dataframe/classifier
-        #guesses_gt = [df[['guess','class']] for df in dfs]
         weights = []
-        #for tup in guesses_gt:
         for df in dfs:
             tup = df[['guess', 'class']]
             num_correct = len(np.where(tup['guess'] == tup['class']))
@@ -1133,6 +1147,8 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in
                 weights.append(1.0)
             elif weighting == 'adaboost':
                 weights.append(self._adaboost(tup['guess'], tup['class']))
+        
+        #Set the method function for evaluating each DataFrame
         method_f = None
         if method == 'max':
             method_f = self._max_class
@@ -1143,11 +1159,14 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in
         elif method == 'min':
             method_f = self._min_class
 
-        merged_guesses = pd.DataFrame(data=input_df1[['guess']], columns=['guess_1'])
+        # Initialize the dataFrame containing just the guesses, and to hold the results
+        merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1'])
+        merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1'])
         merged_guesses.loc[:, 'guess_2'] = input_df2['guess']
         merged_guesses.loc[:, 'guess_3'] = input_df3['guess']
-        merged_guesses.loc[:, 'guess_4'] = input_df3['guess']
+        merged_guesses.loc[:, 'guess_4'] = input_df4['guess']
         merged_guesses.loc[:, 'guess'] = None
+        
         for row_ix in merged_guesses.index:
             merged_guesses['guess'].loc[row_ix] = method_f(merged_guesses[['guess_1', 'guess_2', 'guess_3', 'guess_4']].iloc[row_ix], weights)
         combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]])
@@ -1561,7 +1580,6 @@ def _evaluate_individual(self, individual, training_testing_data):
             # Throw out GP expressions that are too large to be compiled in Python
             return 0.,
 
-        print(individual)
         result = func(training_testing_data)
         result = result[result['group'] == 'testing']
         res = self.scoring_function(result)

From 4ff886c20b587bcb9c5aac8e0cfe15d5d8560ce4 Mon Sep 17 00:00:00 2001
From: Nathan <bartleyn@uchicago.edu>
Date: Fri, 26 Feb 2016 21:36:16 -0600
Subject: [PATCH 05/13] Removed adaboost, moved some consensus operator options
 to class fields.

---
 tests.py     | 18 +++++++-------
 tpot/tpot.py | 69 +++++++++++++++++++++++-----------------------------
 2 files changed, 40 insertions(+), 47 deletions(-)

diff --git a/tests.py b/tests.py
index 276336d6..d2aaa1bf 100644
--- a/tests.py
+++ b/tests.py
@@ -163,21 +163,21 @@ def test_consensus():
     res_four_ada_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
                                      'guess': [1,0,0,0,1,1,1,0,1,1]})
     
-    tests = [(tpot_obj._consensus_two, 0, 3, df1, df2), #accuracy, max
-             (tpot_obj._consensus_two, 0, 4, df1, df2), #accuracy mean
-             (tpot_obj._consensus_two, 0, 5, df1, df2), # accuracy median
-             (tpot_obj._consensus_two, 0, 6, df1, df2), # accuracy min
-             (tpot_obj._consensus_two, 1, 3, df1, df2), # uniform max
-             (tpot_obj._consensus_two, 2, 3, df1, df2), # adaboost max
-             (tpot_obj._consensus_three, 2, 3, df1, df3, df4), # adaboost max
-             (tpot_obj._consensus_four, 2, 3, df1, df2, df3, df4) #adaboost max
+    tests = [(tpot_obj._consensus_two, 0, 2, df1, df2), #accuracy, max
+             (tpot_obj._consensus_two, 0, 3, df1, df2), #accuracy mean
+             (tpot_obj._consensus_two, 0, 4, df1, df2), # accuracy median
+             (tpot_obj._consensus_two, 0, 5, df1, df2), # accuracy min
+             (tpot_obj._consensus_two, 1, 2, df1, df2), # uniform max
+             #(tpot_obj._consensus_two, 2, 3, df1, df2), # adaboost max
+             (tpot_obj._consensus_three, 1, 2, df1, df3, df4), # uni max
+             (tpot_obj._consensus_four, 1, 2, df1, df2, df3, df4) #uni  max
              ]
     results = [res_two_acc_max,
            res_two_acc_mean,
            res_two_acc_med,
            res_two_acc_min,
            res_two_uni_max,
-           res_two_ada_max,
+           #res_two_ada_max,
            res_three_ada_max,
            res_four_ada_max
            ]
diff --git a/tpot/tpot.py b/tpot/tpot.py
index 36276a13..bb73c6c1 100644
--- a/tpot/tpot.py
+++ b/tpot/tpot.py
@@ -149,6 +149,10 @@ def __init__(self, population_size=100, generations=100,
             self.scoring_function=self._balanced_accuracy
         else:
             self.scoring_function=scoring_function
+        
+        self._consensus_options = ['accuracy', 'uniform', 'max', 'mean', 'median', 'min']
+        self._num_consensus_options = len(self._consensus_options)
+        self._consensus_opt_split_ix = 1
 
 
     def fit(self, features, classes, feature_names=None):
@@ -918,20 +922,6 @@ def _min_class(self, classes, weights):
         ht = self._get_ht_dict(classes, weights)
         return self._get_top(classes, sorted(ht.items(), key=operator.itemgetter(1)))
 
-    def _adaboost(self, classes, gt):
-        """Weigh each of a DataFrame's guesses according to an adaboost-like scheme.
-        """
-        # weigh the incorrect classes higher than the correct classes
-        #num_correct = len(np.where(classes == gt))
-        #total = classes.size
-        # this is not quite right, the weight here is for the classifier as a whole, not the weifghted error of a single mistake
-        #weights = pd.Series(data=np.array(0.5 * np.log((1- error)/(error)))
-
-        # e ^ - y_i * alpha_m * k_m (x_i)
-        #init_weights = np.sum(np.exp(-1 * gt * 1 * classes), axis=0)        
-        return (1.0)
-   
-
     def _consensus_two(self, weighting, method, input_df1, input_df2):
         """Takes the classifications of different models and combines them in a meaningful manner.
         
@@ -962,16 +952,19 @@ def _consensus_two(self, weighting, method, input_df1, input_df2):
                     return df
             return dfs[0].copy() 
 
-        #if weighting not in ['accuracy', 'uniform', 'adaboost']:
-        if weighting % 7 > 2:
+        options = self._consensus_options
+        num_options = self._num_consensus_options
+        opt_split_ix = self._consensus_opt_split_ix
+
+        #if weighting not in ['accuracy', 'uniform']:
+        if weighting % num_options > (opt_split_ix):
             return dfs[0].copy()
         #if method not in ['max', 'mean', 'median', 'min']:
-        if method % 7 <= 2:
+        if method % num_options <= (opt_split_ix):
             return dfs[0].copy()
 
-        options = ['accuracy', 'uniform', 'adaboost', 'max', 'mean', 'median', 'min']
-        weighting = options[weighting % 7]
-        method = options[method % 7]
+        weighting = options[weighting % num_options]
+        method = options[method % num_options]
         #Establish the weights for each dataframe/classifier
         weights = []
         for df in dfs:
@@ -982,8 +975,6 @@ def _consensus_two(self, weighting, method, input_df1, input_df2):
                 weights.append(float(num_correct) / float(total_vals))
             elif weighting == 'uniform':
                 weights.append(1.0)
-            elif weighting == 'adaboost':
-                weights.append(self._adaboost(tup['guess'], tup['class']))
         #Set the method function for evaluating each DataFrame
         method_f = None
         if method == 'max':
@@ -1040,16 +1031,19 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3):
                     return df
             return dfs[0].copy() 
 
-        #if weighting not in ['accuracy', 'uniform', 'adaboost']:
-        if weighting % 7 > 2:
+        options = self._consensus_options
+        num_options = self._num_consensus_options
+        opt_split_ix = self._consensus_opt_split_ix
+        
+        #if weighting not in ['accuracy', 'uniform']:
+        if weighting % num_options > opt_split_ix:
             return dfs[0].copy()
         #if method not in ['max', 'mean', 'median', 'min']:
-        if method % 7 <= 2:
+        if method % num_options <= opt_split_ix:
             return dfs[0].copy()
 
-        options = ['accuracy', 'uniform', 'adaboost', 'max', 'mean', 'median', 'min']
-        weighting = options[weighting % 7]
-        method = options[method % 7]
+        weighting = options[weighting % num_options]
+        method = options[method % num_options]
         #Establish the weights for each dataframe/classifier
         weights = []
         for df in dfs:
@@ -1060,8 +1054,6 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3):
                 weights.append(float(num_correct) / float(total_vals))
             elif weighting == 'uniform':
                 weights.append(1.0)
-            elif weighting == 'adaboost':
-                weights.append(self._adaboost(tup['guess'], tup['class']))
         
         #Set the method function for evaluating each DataFrame
         method_f = None
@@ -1125,16 +1117,19 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in
                     return df
             return dfs[0].copy() 
 
-        #if weighting not in ['accuracy', 'uniform', 'adaboost']:
-        if weighting % 7 > 2:
+        options = self._consensus_options
+        num_options = self._num_consensus_options
+        opt_split_ix = self._consensus_opt_split_ix
+        
+        #if weighting not in ['accuracy', 'uniform']:
+        if weighting % num_options > opt_split_ix:
             return dfs[0].copy()
         #if method not in ['max', 'mean', 'median', 'min']:
-        if method % 7 <= 2:
+        if method % num_options <= opt_split_ix:
             return dfs[0].copy()
 
-        options = ['accuracy', 'uniform', 'adaboost', 'max', 'mean', 'median', 'min']
-        weighting = options[weighting % 7]
-        method = options[method % 7]
+        weighting = options[weighting % num_options]
+        method = options[method % num_options]
         #Establish the weights for each dataframe/classifier
         weights = []
         for df in dfs:
@@ -1145,8 +1140,6 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in
                 weights.append(float(num_correct) / float(total_vals))
             elif weighting == 'uniform':
                 weights.append(1.0)
-            elif weighting == 'adaboost':
-                weights.append(self._adaboost(tup['guess'], tup['class']))
         
         #Set the method function for evaluating each DataFrame
         method_f = None

From 191c4facedb2fe69d26a3315ee38d98c731e1424 Mon Sep 17 00:00:00 2001
From: Nathan <bartleyn@uchicago.edu>
Date: Fri, 26 Feb 2016 21:47:37 -0600
Subject: [PATCH 06/13] cleanup comments, remove ident and uncomment
 combine_dfs

---
 tpot/tpot.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tpot/tpot.py b/tpot/tpot.py
index bb73c6c1..f3467509 100644
--- a/tpot/tpot.py
+++ b/tpot/tpot.py
@@ -108,7 +108,7 @@ def __init__(self, population_size=100, generations=100,
         self.pset.addPrimitive(self.svc, [pd.DataFrame, float], pd.DataFrame)
         self.pset.addPrimitive(self.knnc, [pd.DataFrame, int], pd.DataFrame)
         self.pset.addPrimitive(self.gradient_boosting, [pd.DataFrame, float, int, int], pd.DataFrame)
-        #self.pset.addPrimitive(self._combine_dfs, [pd.DataFrame, pd.DataFrame], pd.DataFrame)
+        self.pset.addPrimitive(self._combine_dfs, [pd.DataFrame, pd.DataFrame], pd.DataFrame)
         self.pset.addPrimitive(self._variance_threshold, [pd.DataFrame, float], pd.DataFrame)
         self.pset.addPrimitive(self._select_kbest, [pd.DataFrame, int], pd.DataFrame) 
         self.pset.addPrimitive(self._select_percentile, [pd.DataFrame, int], pd.DataFrame)
@@ -125,7 +125,6 @@ def __init__(self, population_size=100, generations=100,
         self.pset.addPrimitive(self._consensus_two, [int, int, pd.DataFrame, pd.DataFrame], pd.DataFrame)
         self.pset.addPrimitive(self._consensus_three, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
         self.pset.addPrimitive(self._consensus_four, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
-        #self.pset.addPrimitive(self._ident, [str], str)
 
         for val in range(0, 101):
             self.pset.addTerminal(val, int)
@@ -875,7 +874,6 @@ def gradient_boosting(self, input_df, learning_rate, n_estimators, max_depth):
   
     def _get_ht_dict(self, classes, weights):
         ret = {}
-        #instead of weights of class needs to be weights of classifier
         ctr = 0
         for cls in classes:
             try:
@@ -894,7 +892,6 @@ def _get_top(self, classes, tups):
             if class_ in values:
                 return class_
 
-    #max freq class
     def _max_class(self, classes, weights):
         """Return the class with the highest weight, or the class that appears first with that weight (e.g., input_df1)
         """
@@ -915,7 +912,6 @@ def _median_class(self, classes, weights):
         median_val = np.median(ht.values())
         return self._get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1)))
 
-    #minimum frequency class
     def _min_class(self, classes, weights):
         """Return the class with the minimal weight, or the class that appears first with that weight (e.g., input_df1)
         """

From 22a3e19b2281d7ca8356bda8290b2c8fa8e465a9 Mon Sep 17 00:00:00 2001
From: Nathan <bartleyn@uchicago.edu>
Date: Fri, 26 Feb 2016 21:47:37 -0600
Subject: [PATCH 07/13] cleanup comments, remove ident and uncomment
 combine_dfs

---
 tpot/tpot.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/tpot/tpot.py b/tpot/tpot.py
index bb73c6c1..cceee419 100644
--- a/tpot/tpot.py
+++ b/tpot/tpot.py
@@ -108,7 +108,7 @@ def __init__(self, population_size=100, generations=100,
         self.pset.addPrimitive(self.svc, [pd.DataFrame, float], pd.DataFrame)
         self.pset.addPrimitive(self.knnc, [pd.DataFrame, int], pd.DataFrame)
         self.pset.addPrimitive(self.gradient_boosting, [pd.DataFrame, float, int, int], pd.DataFrame)
-        #self.pset.addPrimitive(self._combine_dfs, [pd.DataFrame, pd.DataFrame], pd.DataFrame)
+        self.pset.addPrimitive(self._combine_dfs, [pd.DataFrame, pd.DataFrame], pd.DataFrame)
         self.pset.addPrimitive(self._variance_threshold, [pd.DataFrame, float], pd.DataFrame)
         self.pset.addPrimitive(self._select_kbest, [pd.DataFrame, int], pd.DataFrame) 
         self.pset.addPrimitive(self._select_percentile, [pd.DataFrame, int], pd.DataFrame)
@@ -125,7 +125,6 @@ def __init__(self, population_size=100, generations=100,
         self.pset.addPrimitive(self._consensus_two, [int, int, pd.DataFrame, pd.DataFrame], pd.DataFrame)
         self.pset.addPrimitive(self._consensus_three, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
         self.pset.addPrimitive(self._consensus_four, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
-        #self.pset.addPrimitive(self._ident, [str], str)
 
         for val in range(0, 101):
             self.pset.addTerminal(val, int)
@@ -875,7 +874,6 @@ def gradient_boosting(self, input_df, learning_rate, n_estimators, max_depth):
   
     def _get_ht_dict(self, classes, weights):
         ret = {}
-        #instead of weights of class needs to be weights of classifier
         ctr = 0
         for cls in classes:
             try:
@@ -894,7 +892,6 @@ def _get_top(self, classes, tups):
             if class_ in values:
                 return class_
 
-    #max freq class
     def _max_class(self, classes, weights):
         """Return the class with the highest weight, or the class that appears first with that weight (e.g., input_df1)
         """
@@ -915,7 +912,6 @@ def _median_class(self, classes, weights):
         median_val = np.median(ht.values())
         return self._get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1)))
 
-    #minimum frequency class
     def _min_class(self, classes, weights):
         """Return the class with the minimal weight, or the class that appears first with that weight (e.g., input_df1)
         """
@@ -1600,8 +1596,6 @@ def _balanced_accuracy(self, result):
         all_classes = list(set(result['class'].values))
         all_class_accuracies = []
         for this_class in all_classes:
-            #print("this class {}".format(this_class))
-            #print("result['guess'] {}".format(result['guess']))
             this_class_accuracy = len(result[(result['guess'] == this_class) \
                 & (result['class'] == this_class)])\
                 / float(len(result[result['class'] == this_class]))

From c560f1abd241160cf43e6cbd00518e0a7b913a8f Mon Sep 17 00:00:00 2001
From: Nathan <bartleyn@uchicago.edu>
Date: Fri, 26 Feb 2016 21:56:06 -0600
Subject: [PATCH 08/13] Added comment to get_ht_dict

---
 tpot/tpot.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tpot/tpot.py b/tpot/tpot.py
index cceee419..bec213ac 100644
--- a/tpot/tpot.py
+++ b/tpot/tpot.py
@@ -873,6 +873,8 @@ def gradient_boosting(self, input_df, learning_rate, n_estimators, max_depth):
     
   
     def _get_ht_dict(self, classes, weights):
+        """Return a dictionary whose keys are the unique class values present in this row of guesses and whose values are the summed weights assigned to the guesses for each class.
+        """
         ret = {}
         ctr = 0
         for cls in classes:

From cbf49459c418a5e98a0874881c7df23bdef5b99e Mon Sep 17 00:00:00 2001
From: Nathan <bartleyn@uchicago.edu>
Date: Sat, 27 Feb 2016 09:14:30 -0600
Subject: [PATCH 09/13] Updated export_utils, cleanup

---
 tpot/export_utils.py | 256 +++++++++++++++++++++++++++++++++++++++++++
 tpot/tpot.py         |   7 +-
 2 files changed, 259 insertions(+), 4 deletions(-)

diff --git a/tpot/export_utils.py b/tpot/export_utils.py
index f1813bae..7c284239 100644
--- a/tpot/export_utils.py
+++ b/tpot/export_utils.py
@@ -22,6 +22,91 @@
 
 import deap
 
+consensus_options = ['accuracy', 'uniform', 'max', 'mean', 'median', 'min']
+num_consensus_options = 6
+consensus_opt_split_ix = 1
+
+def consensus_operator_prefix(weight_scheme, method, operator_text):
+    """Utility function for generating the first part of the consensus operator text
+    
+    Parameters
+    ----------
+    weight_scheme: integer 
+        The corrected-for weight_scheme index in consensus_options 
+    method: integer
+        The corrected-for method index in consensus_options
+    operator_text: String
+        The current operator export string (to be appended onto)
+
+    Returns
+    -------
+    operator_text: String
+        The updated operator export string
+    """
+
+    operator_text +='''\n
+def _get_ht_dict(classes, weights):
+    """Return a dictionary where the keys are the unique class values present in this row of guesses, and the weights are the weights assigned to each guess.
+    """
+    ret = {}
+    ctr = 0
+    for cls in classes:
+        try:
+            ret[cls] += weights[ctr]
+        except:
+            ret[cls] = weights[ctr]
+        ctr += 1
+    return ret
+
+def _get_top( classes, tups):
+    """Return the class from the row in the first DataFrame passed to the function (e.g., input_df1)
+    """
+    values = [tup[0] for tup in tups if tup[1] == tups[0][1]]
+    for class_ in classes:
+        if class_ in values:
+            return class_
+    '''
+    if consensus_options[method % num_consensus_options] == 'max':
+        operator_text += '''\n
+def _max_class(classes, weights):
+    """Return the class with the highest weight, or the class that appears first with that weight (e.g., input_df1)
+    """
+    ht = _get_ht_dict(classes, weights)
+    return _get_top(classes, sorted(ht.items(), key=operator.itemgetter(1), reverse=True))
+method = _max_class
+        '''
+    elif consensus_options[method % num_consensus_options] == 'mean':
+        operator_text += '''\n
+def _mean_class( classes, weights):
+    """Return the class closest to the mean weight, or the class that appears first with that weight (e.g., input_df1)
+    """
+    ht = _get_ht_dict(classes, weights)
+    mean_val = np.mean(ht.values())
+    return _get_top(classes, sorted(((x, abs(y - mean_val)) for (x,y) in ht.items()), key=operator.itemgetter(1)))
+method = _mean_class
+        '''
+    elif consensus_options[method % num_consensus_options] == 'median':
+        operator_text += '''\n
+def _median_class(classes, weights):
+    """Return the class closest to the median weight, or the class that appears first with that weight (e.g., input_df1)
+    """
+    ht = _get_ht_dict(classes, weights)
+    median_val = np.median(ht.values())
+    return _get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1)))
+method = _median_class
+        '''
+    elif consensus_options[method % num_consensus_options] == 'min':
+        operator_text += '''\n
+def _min_class(classes, weights):
+    """Return the class with the minimal weight, or the class that appears first with that weight (e.g., input_df1)
+    """
+    ht = _get_ht_dict(classes, weights)
+    return _get_top(classes, sorted(ht.items(), key=operator.itemgetter(1)))
+method = _min_class
+        '''
+    return operator_text
+
+
 def replace_mathematical_operators(exported_pipeline):
     """Replace all of the mathematical operators with their results for use in export(self, output_file_name)
 
@@ -291,6 +376,177 @@ def replace_function_calls(pipeline_list):
             operator_text += '\n# Combine two DataFrames'
             operator_text += '\n{2} = {0}.join({1}[[column for column in {1}.columns.values if column not in {0}.columns.values]])\n'.format(operator[2], operator[3], result_name)
 
+        elif operator_name == '_consensus_two':
+            weight_scheme = int(operator[2])
+            method = int(operator[3])
+            if weight_scheme % num_consensus_options > consensus_opt_split_ix:
+                weight_scheme = consensus_opt_split_ix
+            if method % num_consensus_options <= consensus_opt_split_ix:
+                method = consensus_opt_split_ix + 1
+            
+            operator_text = consensus_operator_prefix(weight_scheme, method, operator_text)  # helper returns the full accumulated text; '+=' would duplicate it
+
+            operator_text += '\n# Combine two DataFrames'
+            operator_text += '\ndfs = [{0}, {1}]'.format(operator[4], operator[5])
+            operator_text += '''
+ignore_consensus = False
+if any(len(df.columns) == 3 for df in dfs):
+    found = False
+
+    for df in dfs:
+        if len(df.columns) > 3:
+            {0} = df.copy()
+            found = True
+            break
+    if not found:
+        ignore_consensus = True
+        {0} = df.copy()
+
+if not ignore_consensus:
+    weights = []
+    for df in dfs:
+        tup = df[['guess', 'class']]
+        num_correct = len(np.where(tup['guess'] == tup['class'])[0])
+        total_vals = len(tup['guess'].index)'''.format(result_name)
+            if consensus_options[weight_scheme % num_consensus_options] == 'accuracy':
+                operator_text +='''
+        weights.append(float(num_correct) / float(total_vals))
+        '''
+            elif consensus_options[weight_scheme % num_consensus_options] == 'uniform':
+                operator_text +='''
+        weights.append(1.0)
+        '''
+            operator_text += '''
+    # Initialize the dataFrame containing just the guesses, and to hold the results
+    merged_guesses = pd.DataFrame(data={0}[['guess']].values, columns=['guess_1'])
+    merged_guesses.loc[:, 'guess_2'] = {1}['guess']
+    merged_guesses.loc[:, 'guess'] = None
+
+    for row_ix in merged_guesses.index:
+        merged_guesses['guess'].loc[row_ix] = method(merged_guesses[['guess_1', 'guess_2']].iloc[row_ix], weights)
+    {2} = {0}.join({1}[[column for column in {1}.columns.values if column not in {0}.columns.values]])
+    if 'guess' in {2}.columns.values:
+        {2} = {2}.drop('guess', 1).join(merged_guesses['guess']).copy()
+    else:
+        {2} = {2}.join(merged_guesses['guess'])
+        '''.format(operator[4], operator[5], result_name)
+
+        elif operator_name == '_consensus_three':
+            weight_scheme = int(operator[2])
+            method = int(operator[3])
+            if weight_scheme % num_consensus_options > consensus_opt_split_ix:
+                weight_scheme = consensus_opt_split_ix
+            if method % num_consensus_options <= consensus_opt_split_ix:
+                method = consensus_opt_split_ix + 1
+
+            operator_text = consensus_operator_prefix(weight_scheme, method, operator_text)  # helper returns the full accumulated text; '+=' would duplicate it
+            
+            operator_text += '\n# Combine three DataFrames'
+            operator_text += '\ndfs = [{0}, {1}, {2}]'.format(operator[4], operator[5], operator[6])
+            operator_text += '''
+ignore_consensus = False
+if any(len(df.columns) == 3 for df in dfs):
+    found = False
+
+    for df in dfs:
+        if len(df.columns) > 3:
+            {0} = df.copy()
+            found = True
+            break
+    if not found:
+        ignore_consensus = True
+        {0} = df.copy()
+
+if not ignore_consensus:
+    weights = []
+    for df in dfs:
+        tup = df[['guess', 'class']]
+        num_correct = len(np.where(tup['guess'] == tup['class'])[0])
+        total_vals = len(tup['guess'].index)'''.format(result_name)
+            if consensus_options[weight_scheme % num_consensus_options] == 'accuracy':
+                operator_text +='''
+        weights.append(float(num_correct) / float(total_vals))
+        '''
+            elif consensus_options[weight_scheme % num_consensus_options] == 'uniform':
+                operator_text +='''
+        weights.append(1.0)
+        '''
+            operator_text += '''
+    # Initialize the dataFrame containing just the guesses, and to hold the results
+    merged_guesses = pd.DataFrame(data={0}[['guess']].values, columns=['guess_1'])
+    merged_guesses.loc[:, 'guess_2'] = {1}['guess']
+    merged_guesses.loc[:, 'guess_3'] = {2}['guess']
+    merged_guesses.loc[:, 'guess'] = None
+
+    for row_ix in merged_guesses.index:
+        merged_guesses['guess'].loc[row_ix] = method(merged_guesses[['guess_1', 'guess_2', 'guess_3']].iloc[row_ix], weights)
+    {3} = {0}.join({1}[[column for column in {1}.columns.values if column not in {0}.columns.values]])
+    {3} = {3}.join({2}[[column for column in {2}.columns.values if column not in {3}.columns.values]])
+    if 'guess' in {3}.columns.values:
+        {3} = {3}.drop('guess', 1).join(merged_guesses['guess']).copy()
+    else:
+        {3} = {3}.join(merged_guesses['guess'])
+        '''.format(operator[4], operator[5], operator[6], result_name)
+
+        elif operator_name == '_consensus_four':
+            weight_scheme = int(operator[2])
+            method = int(operator[3])
+            if weight_scheme % num_consensus_options > consensus_opt_split_ix:
+                weight_scheme = consensus_opt_split_ix
+            if method % num_consensus_options <= consensus_opt_split_ix:
+                method = consensus_opt_split_ix + 1
+
+            operator_text = consensus_operator_prefix(weight_scheme, method, operator_text)  # helper returns the full accumulated text; '+=' would duplicate it
+            
+            operator_text += '\n# Combine four DataFrames'
+            operator_text += '\ndfs = [{0}, {1}, {2}, {3}]'.format(operator[4], operator[5], operator[6], operator[7])
+            operator_text += '''
+ignore_consensus = False
+if any(len(df.columns) == 3 for df in dfs):
+    found = False
+
+    for df in dfs:
+        if len(df.columns) > 3:
+            {0} = df.copy()
+            found = True
+            break
+    if not found:
+        ignore_consensus = True
+        {0} = df.copy()
+
+if not ignore_consensus:
+    weights = []
+    for df in dfs:
+        tup = df[['guess', 'class']]
+        num_correct = len(np.where(tup['guess'] == tup['class'])[0])
+        total_vals = len(tup['guess'].index)'''.format(result_name)
+            if consensus_options[weight_scheme % num_consensus_options] == 'accuracy':
+                operator_text +='''
+        weights.append(float(num_correct) / float(total_vals))
+        '''
+            elif consensus_options[weight_scheme % num_consensus_options] == 'uniform':
+                operator_text +='''
+        weights.append(1.0)
+        '''
+            operator_text += '''
+    # Initialize the dataFrame containing just the guesses, and to hold the results
+    merged_guesses = pd.DataFrame(data={0}[['guess']].values, columns=['guess_1'])
+    merged_guesses.loc[:, 'guess_2'] = {1}['guess']
+    merged_guesses.loc[:, 'guess_3'] = {2}['guess']
+    merged_guesses.loc[:, 'guess_4'] = {3}['guess']
+    merged_guesses.loc[:, 'guess'] = None
+
+    for row_ix in merged_guesses.index:
+        merged_guesses['guess'].loc[row_ix] = method(merged_guesses[['guess_1', 'guess_2', 'guess_3', 'guess_4']].iloc[row_ix], weights)
+    {4} = {0}.join({1}[[column for column in {1}.columns.values if column not in {0}.columns.values]])
+    {4} = {4}.join({2}[[column for column in {2}.columns.values if column not in {4}.columns.values]])
+    {4} = {4}.join({3}[[column for column in {3}.columns.values if column not in {4}.columns.values]])
+    if 'guess' in {4}.columns.values:
+        {4} = {4}.drop('guess', 1).join(merged_guesses['guess']).copy()
+    else:
+        {4} = {4}.join(merged_guesses['guess'])
+        '''.format(operator[4], operator[5], operator[6], operator[7], result_name)  # template uses {0}-{3} for the four inputs and {4} for the result
+
         elif operator_name == '_variance_threshold':
             operator_text += '''
 # Use Scikit-learn's VarianceThreshold for feature selection
diff --git a/tpot/tpot.py b/tpot/tpot.py
index 26a67919..0bff51d7 100644
--- a/tpot/tpot.py
+++ b/tpot/tpot.py
@@ -131,9 +131,9 @@ def __init__(self, population_size=100, generations=100,
         self._pset.addPrimitive(self._div, [int, int], float)
         
         # Other operators
-        self.pset.addPrimitive(self._consensus_two, [int, int, pd.DataFrame, pd.DataFrame], pd.DataFrame)
-        self.pset.addPrimitive(self._consensus_three, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
-        self.pset.addPrimitive(self._consensus_four, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
+        self._pset.addPrimitive(self._consensus_two, [int, int, pd.DataFrame, pd.DataFrame], pd.DataFrame)
+        self._pset.addPrimitive(self._consensus_three, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
+        self._pset.addPrimitive(self._consensus_four, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame)
         for val in range(0, 101):
             self._pset.addTerminal(val, int)
         for val in [100.0, 10.0, 1.0, 0.1, 0.01, 0.001, 0.0001]:
@@ -845,7 +845,6 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in
 
         # Initialize the dataFrame containing just the guesses, and to hold the results
         merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1'])
-        merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1'])
         merged_guesses.loc[:, 'guess_2'] = input_df2['guess']
         merged_guesses.loc[:, 'guess_3'] = input_df3['guess']
         merged_guesses.loc[:, 'guess_4'] = input_df4['guess']

From 26a162e264d12f4f497abaddc239583c07e1eed4 Mon Sep 17 00:00:00 2001
From: Nathan <bartleyn@uchicago.edu>
Date: Sat, 27 Feb 2016 09:27:03 -0600
Subject: [PATCH 10/13] Corrected one line in export_utils for the
 consensus_operators

---
 tpot/export_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tpot/export_utils.py b/tpot/export_utils.py
index 7c284239..aa8ce54d 100644
--- a/tpot/export_utils.py
+++ b/tpot/export_utils.py
@@ -400,7 +400,7 @@ def replace_function_calls(pipeline_list):
             break
     if not found:
         ignore_consensus = True
-        {0} = df.copy()
+        {0} = dfs[0].copy()
 
 if not ignore_consensus:
     weights = []
@@ -455,7 +455,7 @@ def replace_function_calls(pipeline_list):
             break
     if not found:
         ignore_consensus = True
-        {0} = df.copy()
+        {0} = dfs[0].copy()
 
 if not ignore_consensus:
     weights = []
@@ -512,7 +512,7 @@ def replace_function_calls(pipeline_list):
             break
     if not found:
         ignore_consensus = True
-        {0} = df.copy()
+        {0} = dfs[0].copy()
 
 if not ignore_consensus:
     weights = []

From 7fac07316c194204a071f41d1f005e2634edb735 Mon Sep 17 00:00:00 2001
From: Nathan <bartleyn@uchicago.edu>
Date: Sat, 27 Feb 2016 10:01:00 -0600
Subject: [PATCH 11/13] Updated print statements in tests to be print function
 call from python3

---
 tests.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests.py b/tests.py
index 736b94c3..c1a1014a 100644
--- a/tests.py
+++ b/tests.py
@@ -186,7 +186,7 @@ def test_consensus():
            ]
 
     for test, res in zip(tests, results):
-        print test[1:3]
-        print test[0](*test[1:])['guess'].values
+        print(test[1:3])
+        print(test[0](*test[1:])['guess'].values)
         assert np.array_equal(test[0](*test[1:])['guess'].values, res['guess'].values)
 

From 47248ede495d70ca1c088ac93bfaeefa1199f3f4 Mon Sep 17 00:00:00 2001
From: Nathan <bartleyn@uchicago.edu>
Date: Sat, 27 Feb 2016 17:07:03 -0600
Subject: [PATCH 12/13] 2to3 conversion of consensus stuff, updated
 export_utils and tests

---
 tests.py             | 34 +++++++++++++++++++++-------------
 tpot/export_utils.py | 12 ++++++------
 tpot/tpot.py         | 16 ++++++++--------
 3 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/tests.py b/tests.py
index c1a1014a..7c39e618 100644
--- a/tests.py
+++ b/tests.py
@@ -92,15 +92,15 @@ def test_xgboost():
 def test_combine_dfs():
     tpot_obj = TPOT()
 
-    df1 = pd.DataFrame({'a': range(10),
-                        'b': range(10, 20)})
+    df1 = pd.DataFrame({'a': list(range(10)),
+                        'b': list(range(10, 20))})
 
-    df2 = pd.DataFrame({'b': range(10, 20),
-                        'c': range(20, 30)})
+    df2 = pd.DataFrame({'b': list(range(10, 20)),
+                        'c': list(range(20, 30))})
 
-    combined_df = pd.DataFrame({'a': range(10),
-                                'b': range(10, 20),
-                                'c': range(20, 30)})
+    combined_df = pd.DataFrame({'a': list(range(10)),
+                                'b': list(range(10, 20)),
+                                'c': list(range(20, 30))})
 
     assert tpot_obj._combine_dfs(df1, df2).equals(combined_df)
 
@@ -130,16 +130,24 @@ def test_static_models():
 def test_consensus():
     tpot_obj = TPOT()
 
-    df1 = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+    df1 = pd.DataFrame({'00001': [2,2,2,2,2,2,2,2,2,2],
+                        '00002': [3,3,3,3,3,3,3,3,3,3],
+                        'class': [1,1,1,1,1,0,0,0,0,0],
                         'guess': [1,0,1,0,1,0,1,0,1,0]})
 
-    df2 = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+    df2 = pd.DataFrame({'00001': [2,2,2,2,2,2,2,2,2,2],
+                        '00002': [3,3,3,3,3,3,3,3,3,3],
+                        'class': [1,1,1,1,1,0,0,0,0,0],
                         'guess': [0,0,0,0,0,1,1,1,1,1]})
 
-    df3 = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+    df3 = pd.DataFrame({'00001': [2,2,2,2,2,2,2,2,2,2],
+                        '00002': [3,3,3,3,3,3,3,3,3,3],
+                        'class': [1,1,1,1,1,0,0,0,0,0],
                         'guess': [0,1,0,1,0,1,0,1,0,1]})
 
-    df4 = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+    df4 = pd.DataFrame({'00001': [2,2,2,2,2,2,2,2,2,2],
+                        '00002': [3,3,3,3,3,3,3,3,3,3],
+                        'class': [1,1,1,1,1,0,0,0,0,0],
                         'guess': [1,1,0,0,1,1,0,0,1,1]})
 
     uniform = [1.0] * 10
@@ -186,7 +194,7 @@ def test_consensus():
            ]
 
     for test, res in zip(tests, results):
-        print(test[1:3])
-        print(test[0](*test[1:])['guess'].values)
+        #print(test[1:3])
+        #print(test[0](*test[1:])['guess'].values)
         assert np.array_equal(test[0](*test[1:])['guess'].values, res['guess'].values)
 
diff --git a/tpot/export_utils.py b/tpot/export_utils.py
index aa8ce54d..78dd5bf8 100644
--- a/tpot/export_utils.py
+++ b/tpot/export_utils.py
@@ -72,7 +72,7 @@ def _max_class(classes, weights):
     """Return the class with the highest weight, or the class that appears first with that weight (e.g., input_df1)
     """
     ht = _get_ht_dict(classes, weights)
-    return _get_top(classes, sorted(ht.items(), key=operator.itemgetter(1), reverse=True))
+    return _get_top(classes, sorted(list(ht.items()), key=operator.itemgetter(1), reverse=True))
 method = _max_class
         '''
     elif consensus_options[method % num_consensus_options] == 'mean':
@@ -81,8 +81,8 @@ def _mean_class( classes, weights):
     """Return the class closest to the mean weight, or the class that appears first with that weight (e.g., input_df1)
     """
     ht = _get_ht_dict(classes, weights)
-    mean_val = np.mean(ht.values())
-    return _get_top(classes, sorted(((x, abs(y - mean_val)) for (x,y) in ht.items()), key=operator.itemgetter(1)))
+    mean_val = np.mean(list(ht.values()))
+    return _get_top(classes, sorted(((x, abs(y - mean_val)) for (x,y) in list(ht.items())), key=operator.itemgetter(1)))
 method = _mean_class
         '''
     elif consensus_options[method % num_consensus_options] == 'median':
@@ -91,8 +91,8 @@ def _median_class(classes, weights):
     """Return the class closest to the median weight, or the class that appears first with that weight (e.g., input_df1)
     """
     ht = _get_ht_dict(classes, weights)
-    median_val = np.median(ht.values())
-    return _get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1)))
+    median_val = np.median(list(ht.values()))
+    return _get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in list(ht.items())), key=operator.itemgetter(1)))
 method = _median_class
         '''
     elif consensus_options[method % num_consensus_options] == 'min':
@@ -101,7 +101,7 @@ def _min_class(classes, weights):
     """Return the class with the minimal weight, or the class that appears first with that weight (e.g., input_df1)
     """
     ht = _get_ht_dict(classes, weights)
-    return _get_top(classes, sorted(ht.items(), key=operator.itemgetter(1)))
+    return _get_top(classes, sorted(list(ht.items()), key=operator.itemgetter(1)))
 method = _min_class
         '''
     return operator_text
diff --git a/tpot/tpot.py b/tpot/tpot.py
index 0bff51d7..c21845fb 100644
--- a/tpot/tpot.py
+++ b/tpot/tpot.py
@@ -18,7 +18,7 @@
 
 """
 
-from __future__ import print_function
+
 import argparse
 import operator
 import random
@@ -593,27 +593,27 @@ def _max_class(self, classes, weights):
         """Return the class with the highest weight, or the class that appears first with that weight (e.g., input_df1)
         """
         ht = self._get_ht_dict(classes, weights)
-        return self._get_top(classes, sorted(ht.items(), key=operator.itemgetter(1), reverse=True))
+        return self._get_top(classes, sorted(list(ht.items()), key=operator.itemgetter(1), reverse=True))
     
     def _mean_class(self, classes, weights):
         """Return the class closest to the mean weight, or the class that appears first with that weight (e.g., input_df1)
         """
         ht = self._get_ht_dict(classes, weights)
-        mean_val = np.mean(ht.values())
-        return self._get_top(classes, sorted(((x, abs(y - mean_val)) for (x,y) in ht.items()), key=operator.itemgetter(1)))
+        mean_val = np.mean(list(ht.values()))
+        return self._get_top(classes, sorted(((x, abs(y - mean_val)) for (x,y) in list(ht.items())), key=operator.itemgetter(1)))
 
     def _median_class(self, classes, weights):
         """Return the class closest to the median weight, or the class that appears first with that weight (e.g., input_df1)
         """
         ht = self._get_ht_dict(classes, weights)
-        median_val = np.median(ht.values())
-        return self._get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1)))
+        median_val = np.median(list(ht.values()))
+        return self._get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in list(ht.items())), key=operator.itemgetter(1)))
 
     def _min_class(self, classes, weights):
         """Return the class with the minimal weight, or the class that appears first with that weight (e.g., input_df1)
         """
         ht = self._get_ht_dict(classes, weights)
-        return self._get_top(classes, sorted(ht.items(), key=operator.itemgetter(1)))
+        return self._get_top(classes, sorted(list(ht.items()), key=operator.itemgetter(1)))
 
     def _consensus_two(self, weighting, method, input_df1, input_df2):
         """Takes the classifications of different models and combines them in a meaningful manner.
@@ -1488,7 +1488,7 @@ def float_range(value):
     args = parser.parse_args()
 
     if args.version:
-        from _version import __version__
+        from ._version import __version__
         print('TPOT version: {}'.format(__version__))
         return
     elif args.input_file is None:

From 1e621508a6369a90d5b2ce03abc5323aa17704f1 Mon Sep 17 00:00:00 2001
From: Nathan <bartleyn@uchicago.edu>
Date: Tue, 8 Mar 2016 22:38:31 -0600
Subject: [PATCH 13/13] Added threshold consensus operator

---
 tests.py             |  4 ++++
 tpot/export_utils.py | 19 +++++++++++++++++--
 tpot/tpot.py         | 21 ++++++++++++++++++++-
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/tests.py b/tests.py
index 7c39e618..f6d44553 100644
--- a/tests.py
+++ b/tests.py
@@ -169,6 +169,8 @@ def test_consensus():
                                     'guess': [1,0,1,0,1,0,1,0,1,0]})
     res_two_ada_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
                                     'guess': [1,0,1,0,1,0,1,0,1,0]})
+    res_three_uni_thresh = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
+                                         'guess': [0,0,0,0,0,1,1,1,1,1]})
     res_three_ada_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
                                       'guess': [1,1,0,0,1,1,0,0,1,1]})
     res_four_ada_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0],
@@ -180,6 +182,7 @@ def test_consensus():
              (tpot_obj._consensus_two, 0, 5, df1, df2), # accuracy min
              (tpot_obj._consensus_two, 1, 2, df1, df2), # uniform max
              #(tpot_obj._consensus_two, 2, 3, df1, df2), # adaboost max
+             (tpot_obj._consensus_three, 1, 6, df1, df2, df3), #uni threshold
              (tpot_obj._consensus_three, 1, 2, df1, df3, df4), # uni max
              (tpot_obj._consensus_four, 1, 2, df1, df2, df3, df4) #uni  max
              ]
@@ -189,6 +192,7 @@ def test_consensus():
            res_two_acc_min,
            res_two_uni_max,
            #res_two_ada_max,
+           res_three_uni_thresh,
            res_three_ada_max,
            res_four_ada_max
            ]
diff --git a/tpot/export_utils.py b/tpot/export_utils.py
index 78dd5bf8..73abdc07 100644
--- a/tpot/export_utils.py
+++ b/tpot/export_utils.py
@@ -22,8 +22,8 @@
 
 import deap
 
-consensus_options = ['accuracy', 'uniform', 'max', 'mean', 'median', 'min']
-num_consensus_options = 6
+consensus_options = ['accuracy', 'uniform', 'max', 'mean', 'median', 'min', 'threshold']
+num_consensus_options = 7
 consensus_opt_split_ix = 1
 
 def consensus_operator_prefix(weight_scheme, method, operator_text):
@@ -104,6 +104,21 @@ def _min_class(classes, weights):
     return _get_top(classes, sorted(list(ht.items()), key=operator.itemgetter(1)))
 method = _min_class
         '''
+    elif consensus_options[method % num_consensus_options] == 'threshold':
+        operator_text += '''\n
+def _threshold_class(classes, weights):
+    """Return the class with that contains a certain percentage of the weight 
+    """
+    ht = _get_ht_dict(classes, weights)
+    total_weight = sum(list(ht.values()))
+    threshold = 0.75
+    sorted_classes = sorted(((x, float(y) / total_weight) for (x,y) in list(ht.items()) if (float(y) / total_weight) > threshold), key=operator.itemgetter(1), reverse=True)
+    while len(sorted_classes) == 0:
+        threshold = threshold - 0.05
+        sorted_classes = sorted(((x, float(y) / total_weight) for (x,y) in list(ht.items()) if (float(y) / total_weight) > threshold), key=operator.itemgetter(1), reverse=True)
+    return _get_top(classes, sorted_classes)
+method = _threshold_class
+        '''
     return operator_text
 
 
diff --git a/tpot/tpot.py b/tpot/tpot.py
index 0000df84..32fa8b66 100644
--- a/tpot/tpot.py
+++ b/tpot/tpot.py
@@ -157,7 +157,7 @@ def __init__(self, population_size=100, generations=100,
         else:
             self.scoring_function=scoring_function
         
-        self._consensus_options = ['accuracy', 'uniform', 'max', 'mean', 'median', 'min']
+        self._consensus_options = ['accuracy', 'uniform', 'max', 'mean', 'median', 'min', 'threshold']
         self._num_consensus_options = len(self._consensus_options)
         self._consensus_opt_split_ix = 1
 
@@ -615,6 +615,19 @@ def _min_class(self, classes, weights):
         ht = self._get_ht_dict(classes, weights)
         return self._get_top(classes, sorted(list(ht.items()), key=operator.itemgetter(1)))
 
+    def _threshold_class(self, classes, weights):
+        """Return the class picked from those whose share of the total weight exceeds a threshold (0.75, lowered by 0.05 until one qualifies).
+        """
+        ht = self._get_ht_dict(classes, weights)
+        total_weight = sum(ht.values())
+        if not total_weight:  # no weight to share out (empty or all-zero): rank by raw weight instead of dividing by zero or looping forever
+            return self._get_top(classes, sorted(ht.items(), key=operator.itemgetter(1), reverse=True))
+        shares = [(x, float(y) / total_weight) for (x, y) in ht.items()]  # computed once; reused for every threshold tried
+        threshold = 0.75
+        while not any(share > threshold for (_, share) in shares):
+            threshold = threshold - 0.05
+        return self._get_top(classes, sorted((s for s in shares if s[1] > threshold), key=operator.itemgetter(1), reverse=True))
+
     def _consensus_two(self, weighting, method, input_df1, input_df2):
         """Takes the classifications of different models and combines them in a meaningful manner.
         
@@ -678,6 +691,8 @@ def _consensus_two(self, weighting, method, input_df1, input_df2):
             method_f = self._median_class
         elif method == 'min':
             method_f = self._min_class
+        elif method =='threshold':
+            method_f = self._threshold_class
 
         # Initialize the dataFrame containing just the guesses, and to hold the results
         merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1'])
@@ -758,6 +773,8 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3):
             method_f = self._median_class
         elif method == 'min':
             method_f = self._min_class
+        elif method =='threshold':
+            method_f = self._threshold_class
 
         # Initialize the dataFrame containing just the guesses, and to hold the results
         merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1'])
@@ -842,6 +859,8 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in
             method_f = self._median_class
         elif method == 'min':
             method_f = self._min_class
+        elif method =='threshold':
+            method_f = self._threshold_class
 
         # Initialize the dataFrame containing just the guesses, and to hold the results
         merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1'])