From 6c8afec4ec5d38bdc2ea171458962c23bd09d554 Mon Sep 17 00:00:00 2001 From: Nathan Date: Thu, 18 Feb 2016 23:55:34 -0600 Subject: [PATCH 01/13] Added consensus operators, working through adaboost. Need to optimize memory usage. --- tpot/tpot.py | 263 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 262 insertions(+), 1 deletion(-) diff --git a/tpot/tpot.py b/tpot/tpot.py index 6fe10ba8..cfbab2e4 100644 --- a/tpot/tpot.py +++ b/tpot/tpot.py @@ -122,10 +122,17 @@ def __init__(self, population_size=100, generations=100, self.pset.addPrimitive(operator.sub, [int, int], int) self.pset.addPrimitive(operator.mul, [int, int], int) self.pset.addPrimitive(self._div, [int, int], float) + self.pset.addPrimitive(self._consensus_two, [str, str, pd.DataFrame, pd.DataFrame], pd.DataFrame) + self.pset.addPrimitive(self._consensus_three, [str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) + self.pset.addPrimitive(self._consensus_four, [str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) + self.pset.addPrimitive(self._ident, [str], str) + for val in range(0, 101): self.pset.addTerminal(val, int) for val in [100.0, 10.0, 1.0, 0.1, 0.01, 0.001, 0.0001]: self.pset.addTerminal(val, float) + for val in ['max', 'mean', 'median', 'min', 'accuracy', 'uniform', 'adaboost']: + self.pset.addTerminal(val, str) creator.create('FitnessMax', base.Fitness, weights=(1.0,)) creator.create('Individual', gp.PrimitiveTree, fitness=creator.FitnessMax) @@ -863,14 +870,268 @@ def gradient_boosting(self, input_df, learning_rate, n_estimators, max_depth): return self._train_model_and_predict(input_df, GradientBoostingClassifier, learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth, random_state=42) + + def _get_ht_dict(self, classes, weights): + ret = {} + for cls in classes: + try: + ret[cls] += weights[cls] + except: + print(cls) + print(weights) + ret[cls] = weights[cls] + return ret + + #max freq class + def _max_class(self, classes, weights): + ht = self._get_ht_dict(classes, weights) + return sorted(ht.items(), key=operator.itemgetter(1))[-1][0] + + def _mean_class(self, classes, weights): + ht = self._get_ht_dict(classes, weights) + mean_val = np.mean(ht.values()) + return sorted(((x, abs(y - mean_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))[0] + + def _median_class(self, classes, weights): + ht = self._get_ht_dict(classes, weights) + median_val = np.median(ht.values()) + return sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))[0] + + #minimum frequency class + def _min_class(self, classes, weights): + ht = self._get_ht_dict(classes, weights) + return sorted(ht.items(), key=operator.itemgetter(1))[0] + + def _adaboost(self, classes, gt): + # weigh the incorrect classes higher than the correct classes + num_correct = len(np.where(classes == gt)) + total = classes.size# + # this is not quite right, the weight here is for the classifier as a whole, not the weifghted error of a single mistake + #weights = pd.Series(data=np.array(0.5 * np.log((1- error)/(error))) + + # e ^ - y_i * alpha_m * k_m (x_i) + #init_weights = np.sum(np.exp(-1 * gt * 1 * classes), axis=0) + return 1.0 + def _ident(*args): + return args + + def _consensus_two(self, weighting, method, input_df1, input_df2): + """Takes the classifications of different models and combines them in a meaningful manner. + + Parameters + ---------- + weighting: {'accuracy', 'uniform', 'adaboost'} + Method of weighting the classifications from the different DataFrames + method: {'max', 'mean', 'median', 'min'} + Method of combining the classifications from the different DataFrames + input_df1: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} + First input DataFrame to combine guesses + input_df2: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} + Second input DataFrame to combine guesses + + Returns + ------- + combined_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} + Combined DataFrames + + """ + #Validate input + #If either DF is empty and the other isn't, return a non-empty DF as it probably has reasonable guesses + #Otherwise if both are lacking, return the first one + dfs = [input_df1, input_df2] + if any(len(df.columns) == 3 for df in dfs): + for df in dfs: + if len(df.columns) > 3: + return df + return dfs[0].copy() + + if weighting not in ['accuracy', 'uniform', 'adaboost']: + return dfs[0].copy() + if method not in ['max', 'mean', 'median', 'min']: + return dfs[0].copy() + #Establish the weights for each dataframe/classifier + #guesses_gt = [df[['guess','class']] for df in dfs] + guesses + + input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]).copy() + weights = [] + + #for tup in guesses_gt: + for df in dfs: + tup = df[['guess', 'class']] + num_correct = len(np.where(tup['guess'] == tup['class'])) + total_vals = len(tup['guess'].index) + if weighting == 'accuracy': + weights.append(float(num_correct) / float(total_vals)) + elif weighting == 'uniform': + weights.append(1.0) + elif weighting == 'adaboost': + weights.append(self._adaboost(tup['guess'], tup['class'])) + method_f = None + if method == 'max': + method_f = self._max_class + elif method == 'mean': + method_f = self._mean_class + elif method == 'median': + method_f = self._median_class + elif method == 'min': + method_f = self._min_class + + # for each sample, get the appropriate combined value + merged_guesses = pd.merge(input_df1[['guess']], input_df2[['guess']], suffixes=['_1', '_2']) + merged_guesses['res'] = None + for row_ix in merged_guesses.index: + merged_guesses['res'].iloc[row_ix] = method_f(merged_guesses.iloc[row_ix], weights) + + return pd.join(combined_df, merged_guesses['res'].copy()) + + def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3): + """Takes the classifications of different models and combines them in a meaningful manner. + + Parameters + ---------- + weighting: {'accuracy', 'uniform', 'adaboost'} + Method of weighting the classifications from the different DataFrames + method: {'max', 'mean', 'median', 'min'} + Method of combining the classifications from the different DataFrames + input_df1: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} + First input DataFrame to combine guesses + input_df2: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} + Second input DataFrame to combine guesses + input_df3: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} + Third input DataFrame to combine guesses + + Returns + ------- + combined_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} + Combined DataFrames + + """ + #Validate input + #If either DF is empty and the other isn't, return a non-empty DF as it probably has reasonable guesses + #Otherwise if both are lacking, return the first one + dfs = [input_df1, input_df2, input_df3] + if any(len(df.columns) == 3 for df in dfs): + for df in dfs: + if len(df.columns) > 3: + return df + return dfs[0] + + if weighting not in ['accuracy', 'uniform', 'adaboost']: + return dfs[0] + if method not in ['max', 'mean', 'median', 'min']: + return dfs[0] + #Establish the weights for each dataframe/classifier + #guesses_gt = [df[['guess','class']] for df in dfs] + weights = [] + #for tup in guesses_gt: + for df in dfs: + tup = df[['guess', 'class']] + num_correct = len(np.where(tup['guess'] == tup['class'])) + total_vals = len(tup['guess'].index) + if weighting == 'accuracy': + weights.append(float(num_correct) / float(total_vals)) + elif weighting == 'uniform': + weights.append(1.0) + elif weighting == 'adaboost': + weights.append(self._adaboost(tup['guess'], tup['class'])) + method_f = None + if method == 'max': + method_f = self._max_class + elif method == 'mean': + method_f = self._mean_class + elif method == 'median': + method_f = self._median_class + elif method == 'min': + method_f = self._min_class + + # for each sample, get the appropriate combined value + merged_guesses = pd.merge(input_df1[['guess']], pd.merge(input_df2[['guess']], input_df3[['guess']], suffixes=['_2', '_3']), suffixes=['_1', '_2']) + merged_guesses['res'] = None + for row_ix in merged_guesses.index: + merged_guesses['res'].iloc[row_ix] = method_f(merged_guesses.iloc[row_ix], weights) + + return pd.join(combined_df, merged_guesses['res'].copy()) + + def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, input_df4): + """Takes the classifications of different models and combines them in a meaningful manner. + + Parameters + ---------- + weighting: {'accuracy', 'uniform', 'adaboost'} + Method of weighting the classifications from the different DataFrames + method: {'max', 'mean', 'median', 'min'} + Method of combining the classifications from the different DataFrames + input_df1: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} + First input DataFrame to combine guesses + input_df2: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} + Second input DataFrame to combine guesses + input_df3: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} + Third input DataFrame to combine guesses + input_df4: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} + Fourth input DataFrame to combine guesses + + Returns + ------- + combined_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} + Combined DataFrames + + """ + #Validate input + #If either DF is empty and the other isn't, return a non-empty DF as it probably has reasonable guesses + #Otherwise if both are lacking, return the first one + dfs = [input_df1, input_df2, input_df3, input_df4] + if any(len(df.columns) == 3 for df in dfs): + for df in dfs: + if len(df.columns) > 3: + return df + return dfs[0] + + if weighting not in ['accuracy', 'uniform', 'adaboost']: + return dfs[0] + if method not in ['max', 'mean', 'median', 'min']: + return dfs[0] + #Establish the weights for each dataframe/classifier + #guesses_gt = [df[['guess','class']] for df in dfs] + weights = [] + #for tup in guesses_gt: + for df in dfs: + tup = df[['guess', 'total']] + num_correct = len(np.where(tup['guess'] == tup['class'])) + total_vals = len(tup['guess'].index) + if weighting == 'accuracy': + weights.append(float(num_correct) / float(total_vals)) + elif weighting == 'uniform': + weights.append(1.0) + elif weighting == 'adaboost': + weights.append(self._adaboost(tup['guess'], tup['class'])) + method_f = None + if method == 'max': + method_f = self._max_class + elif method == 'mean': + method_f = self._mean_class + elif method == 'median': + method_f = self._median_class + elif method == 'min': + method_f = self._min_class + + # for each sample, get the appropriate combined value + merged_guesses = pd.merge(input_df1[['guess']], pd.merge(input_df2[['guess']], pd.merge(input_df3[['guess']], input_df4[['guess']], suffixes=['_3', '_4']), suffixes=['_2', '_3']), suffixes=['_1', '_2']) + merged_guesses['res'] = None + for row_ix in merged_guesses.index: + merged_guesses['res'].iloc[row_ix] = method_f(merged_guesses.iloc[row_ix], weights) + + return pd.join(combined_df, merged_guesses['res'].copy()) + + def _train_model_and_predict(self, input_df, model, **kwargs): """Fits an arbitrary sklearn classifier model with a set of keyword parameters Parameters ---------- input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} - Input DataFrame for fitting the k-neares + Input DataFrame for fitting the model model: sklearn classifier Input model to fit and predict on input_df kwargs: unpacked parameters From 160593e1e0e1d821ba8f849daddd35b64efae783 Mon Sep 17 00:00:00 2001 From: Nathan Date: Sun, 21 Feb 2016 23:34:52 -0600 Subject: [PATCH 02/13] consensus_two working, needs another once over for optimization, and cleaning up comments --- tpot/tpot.py | 122 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 84 insertions(+), 38 deletions(-) diff --git a/tpot/tpot.py b/tpot/tpot.py index cfbab2e4..5eddcd69 100644 --- a/tpot/tpot.py +++ b/tpot/tpot.py @@ -108,7 +108,7 @@ def __init__(self, population_size=100, generations=100, self.pset.addPrimitive(self.svc, [pd.DataFrame, float], pd.DataFrame) self.pset.addPrimitive(self.knnc, [pd.DataFrame, int], pd.DataFrame) self.pset.addPrimitive(self.gradient_boosting, [pd.DataFrame, float, int, int], pd.DataFrame) - self.pset.addPrimitive(self._combine_dfs, [pd.DataFrame, pd.DataFrame], pd.DataFrame) + #self.pset.addPrimitive(self._combine_dfs, [pd.DataFrame, pd.DataFrame], pd.DataFrame) self.pset.addPrimitive(self._variance_threshold, [pd.DataFrame, float], pd.DataFrame) self.pset.addPrimitive(self._select_kbest, [pd.DataFrame, int], pd.DataFrame) self.pset.addPrimitive(self._select_percentile, [pd.DataFrame, int], pd.DataFrame) @@ -122,17 +122,17 @@ def __init__(self, population_size=100, generations=100, self.pset.addPrimitive(operator.sub, [int, int], int) self.pset.addPrimitive(operator.mul, [int, int], int) self.pset.addPrimitive(self._div, [int, int], float) - self.pset.addPrimitive(self._consensus_two, [str, str, pd.DataFrame, pd.DataFrame], pd.DataFrame) - self.pset.addPrimitive(self._consensus_three, [str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) - self.pset.addPrimitive(self._consensus_four, [str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) - self.pset.addPrimitive(self._ident, [str], str) + self.pset.addPrimitive(self._consensus_two, [int, int, pd.DataFrame, pd.DataFrame], pd.DataFrame) + #self.pset.addPrimitive(self._consensus_three, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) + #self.pset.addPrimitive(self._consensus_four, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) + #self.pset.addPrimitive(self._ident, [str], str) for val in range(0, 101): self.pset.addTerminal(val, int) for val in [100.0, 10.0, 1.0, 0.1, 0.01, 0.001, 0.0001]: self.pset.addTerminal(val, float) - for val in ['max', 'mean', 'median', 'min', 'accuracy', 'uniform', 'adaboost']: - self.pset.addTerminal(val, str) + #for val in ['max', 'mean', 'median', 'min', 'accuracy', 'uniform', 'adaboost']: + # self.pset.addTerminal(val, str) creator.create('FitnessMax', base.Fitness, weights=(1.0,)) creator.create('Individual', gp.PrimitiveTree, fitness=creator.FitnessMax) @@ -873,13 +873,14 @@ def gradient_boosting(self, input_df, learning_rate, n_estimators, max_depth): def _get_ht_dict(self, classes, weights): ret = {} + #instead of weights of class needs to be weights of classifier + ctr = 0 for cls in classes: try: - ret[cls] += weights[cls] + ret[cls] += weights[ctr] except: - print(cls) - print(weights) - ret[cls] = weights[cls] + ret[cls] = weights[ctr] + ctr += 1 return ret #max freq class @@ -890,17 +891,17 @@ def _max_class(self, classes, weights): def _mean_class(self, classes, weights): ht = self._get_ht_dict(classes, weights) mean_val = np.mean(ht.values()) - return sorted(((x, abs(y - mean_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))[0] + return sorted(((x, abs(y - mean_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))[0][0] def _median_class(self, classes, weights): ht = self._get_ht_dict(classes, weights) median_val = np.median(ht.values()) - return sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))[0] + return sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))[0][0] #minimum frequency class def _min_class(self, classes, weights): ht = self._get_ht_dict(classes, weights) - return sorted(ht.items(), key=operator.itemgetter(1))[0] + return sorted(ht.items(), key=operator.itemgetter(1))[0][0] def _adaboost(self, classes, gt): # weigh the incorrect classes higher than the correct classes @@ -911,7 +912,7 @@ def _adaboost(self, classes, gt): # e ^ - y_i * alpha_m * k_m (x_i) #init_weights = np.sum(np.exp(-1 * gt * 1 * classes), axis=0) - return 1.0 + return (1.0) def _ident(*args): return args @@ -946,15 +947,20 @@ def _consensus_two(self, weighting, method, input_df1, input_df2): return df return dfs[0].copy() - if weighting not in ['accuracy', 'uniform', 'adaboost']: + #if weighting not in ['accuracy', 'uniform', 'adaboost']: + if weighting % 7 > 2: return dfs[0].copy() - if method not in ['max', 'mean', 'median', 'min']: + #if method not in ['max', 'mean', 'median', 'min']: + if method % 7 <= 2: return dfs[0].copy() + + options = ['accuracy', 'uniform', 'adaboost', 'max', 'mean', 'median', 'min'] + weighting = options[weighting % 7] + method = options[method % 7] #Establish the weights for each dataframe/classifier #guesses_gt = [df[['guess','class']] for df in dfs] - guesses - input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]).copy() + weights = [] #for tup in guesses_gt: @@ -978,14 +984,26 @@ def _consensus_two(self, weighting, method, input_df1, input_df2): elif method == 'min': method_f = self._min_class + merged_guesses = pd.DataFrame(data=input_df1[['guess']], columns=['guess_1']) + merged_guesses.loc[:, 'guess_2'] = input_df2['guess'] # for each sample, get the appropriate combined value - merged_guesses = pd.merge(input_df1[['guess']], input_df2[['guess']], suffixes=['_1', '_2']) - merged_guesses['res'] = None + #merged_guesses = pd.merge(df1_guesses, df2_guesses, suffixes=['_1', '_2']).copy() + #print('merged_guesses columsn {}'.format(merged_guesses.columns.values)) + #print('types!!!!!! {} {}'.format(type(df1_guesses), type(df2_guesses))) + #merged_guesses = pd.DataFrame() + #merged_guesses.loc[:, 'guess_1'] = df1_guesses.copy() + #merged_guesses.loc[:, 'guess_2'] = df2_guesses.copy() + merged_guesses.loc[:, 'guess'] = None for row_ix in merged_guesses.index: - merged_guesses['res'].iloc[row_ix] = method_f(merged_guesses.iloc[row_ix], weights) - - return pd.join(combined_df, merged_guesses['res'].copy()) - + merged_guesses['guess'].loc[row_ix] = method_f(merged_guesses[['guess_1', 'guess_2']].iloc[row_ix], weights) + combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]) + if 'guess' in combined_df.columns.values: + #print('cols: combined_df then merged_guesses') + #print(combined_df.columns.values) + #print(merged_guesses.columns.values) + return combined_df.drop('guess', 1).join(merged_guesses['guess']).copy() + else: + return combined_df.join(merged_guesses['guess']) def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3): """Takes the classifications of different models and combines them in a meaningful manner. @@ -1016,14 +1034,22 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3): for df in dfs: if len(df.columns) > 3: return df - return dfs[0] + return dfs[0].copy() - if weighting not in ['accuracy', 'uniform', 'adaboost']: - return dfs[0] - if method not in ['max', 'mean', 'median', 'min']: - return dfs[0] + #if weighting not in ['accuracy', 'uniform', 'adaboost']: + if weighting % 7 > 2: + return dfs[0].copy() + #if method not in ['max', 'mean', 'median', 'min']: + if method % 7 <= 2: + return dfs[0].copy() + + options = ['accuracy', 'uniform', 'adaboost', 'max', 'mean', 'median', 'min'] + weighting = options[weighting % 7] + method = options[method % 7] #Establish the weights for each dataframe/classifier #guesses_gt = [df[['guess','class']] for df in dfs] + combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]) + combined_df = combined_df.join(input_df3[[column for column in input_df3.columns.values if column not in combined_df.columns.values]]) weights = [] #for tup in guesses_gt: for df in dfs: @@ -1031,9 +1057,9 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3): num_correct = len(np.where(tup['guess'] == tup['class'])) total_vals = len(tup['guess'].index) if weighting == 'accuracy': - weights.append(float(num_correct) / float(total_vals)) + weights.append((float(num_correct) / float(total_vals))) elif weighting == 'uniform': - weights.append(1.0) + weights.append((1.0)) elif weighting == 'adaboost': weights.append(self._adaboost(tup['guess'], tup['class'])) method_f = None @@ -1046,6 +1072,9 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3): elif method == 'min': method_f = self._min_class + df1_guesses = input_df1[['guess']] + df2_guesses = input_df2[['guess']] + df3_guesses = input_df3[['guess']] # for each sample, get the appropriate combined value merged_guesses = pd.merge(input_df1[['guess']], pd.merge(input_df2[['guess']], input_df3[['guess']], suffixes=['_2', '_3']), suffixes=['_1', '_2']) merged_guesses['res'] = None @@ -1088,16 +1117,26 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in return df return dfs[0] - if weighting not in ['accuracy', 'uniform', 'adaboost']: - return dfs[0] - if method not in ['max', 'mean', 'median', 'min']: - return dfs[0] + #if weighting not in ['accuracy', 'uniform', 'adaboost']: + if weighting % 7 > 2: + return dfs[0].copy() + #if method not in ['max', 'mean', 'median', 'min']: + if method % 7 <= 2: + return dfs[0].copy() + + options = ['accuracy', 'uniform', 'adaboost', 'max', 'mean', 'median', 'min'] + weighting = options[weighting % 7] + method = options[method % 7] + combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]).copy() + combined_df = combined_df.join(input_df3[[columns for column in input_df3.columns.values if column not in combined_df.columns.values]]).copy() + combined_df = combined_df.join(input_df4[[columns for column in input_df4.columns.values if column not in combined_df.columns.values]]).copy() + #Establish the weights for each dataframe/classifier #guesses_gt = [df[['guess','class']] for df in dfs] weights = [] #for tup in guesses_gt: for df in dfs: - tup = df[['guess', 'total']] + tup = df[['guess', 'class']] num_correct = len(np.where(tup['guess'] == tup['class'])) total_vals = len(tup['guess'].index) if weighting == 'accuracy': @@ -1116,8 +1155,12 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in elif method == 'min': method_f = self._min_class + df1_guesses = input_df1[['guess']] + df2_guesses = input_df2[['guess']] + df3_guesses = input_df3[['guess']] + df4_guesses = input_df4[['guess']] # for each sample, get the appropriate combined value - merged_guesses = pd.merge(input_df1[['guess']], pd.merge(input_df2[['guess']], pd.merge(input_df3[['guess']], input_df4[['guess']], suffixes=['_3', '_4']), suffixes=['_2', '_3']), suffixes=['_1', '_2']) + merged_guesses = pd.merge(df1_guesses, pd.merge(df2_guesses, pd.merge(df3_guesses, df4_guesses, suffixes=['_3', '_4']), suffixes=['_2', '_3']), suffixes=['_1', '_2']) merged_guesses['res'] = None for row_ix in merged_guesses.index: merged_guesses['res'].iloc[row_ix] = method_f(merged_guesses.iloc[row_ix], weights) @@ -1526,6 +1569,7 @@ def _evaluate_individual(self, individual, training_testing_data): # Throw out GP expressions that are too large to be compiled in Python return 0., + print(individual) result = func(training_testing_data) result = result[result['group'] == 'testing'] res = self.scoring_function(result) @@ -1553,6 +1597,8 @@ def _balanced_accuracy(self, result): all_classes = list(set(result['class'].values)) all_class_accuracies = [] for this_class in all_classes: + #print("this class {}".format(this_class)) + #print("result['guess'] {}".format(result['guess'])) this_class_accuracy = len(result[(result['guess'] == this_class) \ & (result['class'] == this_class)])\ / float(len(result[result['class'] == this_class])) From 6d0ba52d2aa96ac197ad931093f64be1bab6852e Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 22 Feb 2016 00:10:52 -0600 Subject: [PATCH 03/13] consensus_three and four seem to work, no memory blowups like before. Requires more thorough testing, finishing adaboost weighting --- tpot/tpot.py | 68 +++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/tpot/tpot.py b/tpot/tpot.py index 5eddcd69..7c743151 100644 --- a/tpot/tpot.py +++ b/tpot/tpot.py @@ -123,8 +123,8 @@ def __init__(self, population_size=100, generations=100, self.pset.addPrimitive(operator.mul, [int, int], int) self.pset.addPrimitive(self._div, [int, int], float) self.pset.addPrimitive(self._consensus_two, [int, int, pd.DataFrame, pd.DataFrame], pd.DataFrame) - #self.pset.addPrimitive(self._consensus_three, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) - #self.pset.addPrimitive(self._consensus_four, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) + self.pset.addPrimitive(self._consensus_three, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) + self.pset.addPrimitive(self._consensus_four, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) #self.pset.addPrimitive(self._ident, [str], str) for val in range(0, 101): @@ -986,21 +986,11 @@ def _consensus_two(self, weighting, method, input_df1, input_df2): merged_guesses = pd.DataFrame(data=input_df1[['guess']], columns=['guess_1']) merged_guesses.loc[:, 'guess_2'] = input_df2['guess'] - # for each sample, get the appropriate combined value - #merged_guesses = pd.merge(df1_guesses, df2_guesses, suffixes=['_1', '_2']).copy() - #print('merged_guesses columsn {}'.format(merged_guesses.columns.values)) - #print('types!!!!!! {} {}'.format(type(df1_guesses), type(df2_guesses))) - #merged_guesses = pd.DataFrame() - #merged_guesses.loc[:, 'guess_1'] = df1_guesses.copy() - #merged_guesses.loc[:, 'guess_2'] = df2_guesses.copy() merged_guesses.loc[:, 'guess'] = None for row_ix in merged_guesses.index: merged_guesses['guess'].loc[row_ix] = method_f(merged_guesses[['guess_1', 'guess_2']].iloc[row_ix], weights) combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]) if 'guess' in combined_df.columns.values: - #print('cols: combined_df then merged_guesses') - #print(combined_df.columns.values) - #print(merged_guesses.columns.values) return combined_df.drop('guess', 1).join(merged_guesses['guess']).copy() else: return combined_df.join(merged_guesses['guess']) @@ -1048,8 +1038,6 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3): method = options[method % 7] #Establish the weights for each dataframe/classifier #guesses_gt = [df[['guess','class']] for df in dfs] - combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]) - combined_df = combined_df.join(input_df3[[column for column in input_df3.columns.values if column not in combined_df.columns.values]]) weights = [] #for tup in guesses_gt: for df in dfs: @@ -1057,9 +1045,9 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3): num_correct = len(np.where(tup['guess'] == tup['class'])) total_vals = len(tup['guess'].index) if weighting == 'accuracy': - weights.append((float(num_correct) / float(total_vals))) + weights.append(float(num_correct) / float(total_vals)) elif weighting == 'uniform': - weights.append((1.0)) + weights.append(1.0) elif weighting == 'adaboost': weights.append(self._adaboost(tup['guess'], tup['class'])) method_f = None @@ -1072,16 +1060,20 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3): elif method == 'min': method_f = self._min_class - df1_guesses = input_df1[['guess']] - df2_guesses = input_df2[['guess']] - df3_guesses = input_df3[['guess']] - # for each sample, get the appropriate combined value - merged_guesses = pd.merge(input_df1[['guess']], pd.merge(input_df2[['guess']], input_df3[['guess']], suffixes=['_2', '_3']), suffixes=['_1', '_2']) - merged_guesses['res'] = None + merged_guesses = pd.DataFrame(data=input_df1[['guess']], columns=['guess_1']) + merged_guesses.loc[:, 'guess_2'] = input_df2['guess'] + merged_guesses.loc[:, 'guess_3'] = input_df3['guess'] + merged_guesses.loc[:, 'guess'] = None for row_ix in merged_guesses.index: - merged_guesses['res'].iloc[row_ix] = method_f(merged_guesses.iloc[row_ix], weights) + merged_guesses['guess'].loc[row_ix] = method_f(merged_guesses[['guess_1', 'guess_2', 'guess_3']].iloc[row_ix], weights) + combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]) + combined_df = combined_df.join(input_df3[[column for column in input_df3.columns.values if column not in combined_df.columns.values]]) + if 'guess' in combined_df.columns.values: + return combined_df.drop('guess', 1).join(merged_guesses['guess']).copy() + else: + return combined_df.join(merged_guesses['guess']) + - return pd.join(combined_df, merged_guesses['res'].copy()) def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, input_df4): """Takes the classifications of different models and combines them in a meaningful manner. @@ -1115,7 +1107,7 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in for df in dfs: if len(df.columns) > 3: return df - return dfs[0] + return dfs[0].copy() #if weighting not in ['accuracy', 'uniform', 'adaboost']: if weighting % 7 > 2: @@ -1127,10 +1119,6 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in options = ['accuracy', 'uniform', 'adaboost', 'max', 'mean', 'median', 'min'] weighting = options[weighting % 7] method = options[method % 7] - combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]).copy() - combined_df = combined_df.join(input_df3[[columns for column in input_df3.columns.values if column not in combined_df.columns.values]]).copy() - combined_df = combined_df.join(input_df4[[columns for column in input_df4.columns.values if column not in combined_df.columns.values]]).copy() - #Establish the weights for each dataframe/classifier #guesses_gt = [df[['guess','class']] for df in dfs] weights = [] @@ -1155,17 +1143,21 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in elif method == 'min': method_f = self._min_class - df1_guesses = input_df1[['guess']] - df2_guesses = input_df2[['guess']] - df3_guesses = input_df3[['guess']] - df4_guesses = input_df4[['guess']] - # for each sample, get the appropriate combined value - merged_guesses = pd.merge(df1_guesses, pd.merge(df2_guesses, pd.merge(df3_guesses, df4_guesses, suffixes=['_3', '_4']), suffixes=['_2', '_3']), suffixes=['_1', '_2']) - merged_guesses['res'] = None + merged_guesses = pd.DataFrame(data=input_df1[['guess']], columns=['guess_1']) + merged_guesses.loc[:, 'guess_2'] = input_df2['guess'] + merged_guesses.loc[:, 'guess_3'] = input_df3['guess'] + merged_guesses.loc[:, 'guess_4'] = input_df3['guess'] + merged_guesses.loc[:, 'guess'] = None for row_ix in merged_guesses.index: - merged_guesses['res'].iloc[row_ix] = method_f(merged_guesses.iloc[row_ix], weights) + merged_guesses['guess'].loc[row_ix] = method_f(merged_guesses[['guess_1', 'guess_2', 'guess_3', 'guess_4']].iloc[row_ix], weights) + combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]) + combined_df = combined_df.join(input_df3[[column for column in input_df3.columns.values if column not in combined_df.columns.values]]) + combined_df = combined_df.join(input_df4[[column for column in input_df4.columns.values if column not in combined_df.columns.values]]) + if 'guess' in combined_df.columns.values: + return combined_df.drop('guess', 1).join(merged_guesses['guess']).copy() + else: + return combined_df.join(merged_guesses['guess']) - return pd.join(combined_df, merged_guesses['res'].copy()) def _train_model_and_predict(self, input_df, model, **kwargs): From bb8fc28d771a2b56dc76499eaa51e5397fda7869 Mon Sep 17 00:00:00 2001 From: Nathan Date: Thu, 25 Feb 2016 23:59:59 -0600 Subject: [PATCH 04/13] Changed Twitter Bot to TPOT in License, added consensus_two, three, four, simple tests. --- tests.py | 63 ++++++++++++++++++++++++++++++++++++++++++++++ tpot/tpot.py | 70 +++++++++++++++++++++++++++++++++------------------- 2 files changed, 107 insertions(+), 26 deletions(-) diff --git a/tests.py b/tests.py index 70f3a10a..276336d6 100644 --- a/tests.py +++ b/tests.py @@ -124,3 +124,66 @@ def test_static_models(): assert np.array_equal(result['guess'].values, sklearn_model_obj.predict(testing_features)), "Model {} failed".format(str(model)) +def test_consensus(): + tpot_obj = TPOT() + + df1 = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + 'guess': [1,0,1,0,1,0,1,0,1,0]}) + + df2 = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + 'guess': [0,0,0,0,0,1,1,1,1,1]}) + + df3 = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + 'guess': [0,1,0,1,0,1,0,1,0,1]}) + + df4 = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + 'guess': [1,1,0,0,1,1,0,0,1,1]}) + + uniform = [1.0] * 10 + adaboost = [1.0] * 10 + + accuracy1 = 0.60 + accuracy2 = 0.0 + accuracy3 = 0.40 + accuracy4 = 0.50 + res_two_acc_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + 'guess': [1,0,1,0,1,0,1,0,1,0]}) + res_two_acc_mean = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + 'guess': [1,0,1,0,1,0,1,0,1,0]}) + res_two_acc_med = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + 'guess': [1,0,1,0,1,0,1,0,1,0]}) + res_two_acc_min = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + 'guess': [0,0,0,0,0,1,1,1,1,1]}) + res_two_uni_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + 'guess': [1,0,1,0,1,0,1,0,1,0]}) + res_two_ada_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + 'guess': [1,0,1,0,1,0,1,0,1,0]}) + res_three_ada_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + 'guess': [1,1,0,0,1,1,0,0,1,1]}) + res_four_ada_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + 'guess': [1,0,0,0,1,1,1,0,1,1]}) + + tests = [(tpot_obj._consensus_two, 0, 3, df1, df2), #accuracy, max + (tpot_obj._consensus_two, 0, 4, df1, df2), #accuracy mean + (tpot_obj._consensus_two, 0, 5, df1, df2), # accuracy median + (tpot_obj._consensus_two, 0, 6, df1, df2), # accuracy min + (tpot_obj._consensus_two, 1, 3, df1, df2), # uniform max + (tpot_obj._consensus_two, 2, 3, df1, df2), # adaboost max + (tpot_obj._consensus_three, 2, 3, df1, df3, df4), # adaboost max + (tpot_obj._consensus_four, 2, 3, df1, df2, df3, df4) #adaboost max + ] + results = [res_two_acc_max, + res_two_acc_mean, + res_two_acc_med, + res_two_acc_min, + res_two_uni_max, + res_two_ada_max, + res_three_ada_max, + res_four_ada_max + ] + + for test, res in zip(tests, results): + print test[1:3] + print test[0](*test[1:])['guess'].values + assert np.array_equal(test[0](*test[1:])['guess'].values, res['guess'].values) + diff --git a/tpot/tpot.py b/tpot/tpot.py index 7c743151..36276a13 100644 --- a/tpot/tpot.py +++ b/tpot/tpot.py @@ -14,7 +14,7 @@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with -the Twitter Bot library. If not, see http://www.gnu.org/licenses/. +the TPOT library. If not, see http://www.gnu.org/licenses/. """ @@ -131,8 +131,6 @@ def __init__(self, population_size=100, generations=100, self.pset.addTerminal(val, int) for val in [100.0, 10.0, 1.0, 0.1, 0.01, 0.001, 0.0001]: self.pset.addTerminal(val, float) - #for val in ['max', 'mean', 'median', 'min', 'accuracy', 'uniform', 'adaboost']: - # self.pset.addTerminal(val, str) creator.create('FitnessMax', base.Fitness, weights=(1.0,)) creator.create('Individual', gp.PrimitiveTree, fitness=creator.FitnessMax) @@ -883,30 +881,49 @@ def _get_ht_dict(self, classes, weights): ctr += 1 return ret + + def _get_top(self, classes, tups): + """Return the class from the row in the first DataFrame passed to the function (e.g., input_df1) + """ + values = [tup[0] for tup in tups if tup[1] == tups[0][1]] + for class_ in classes: + if class_ in values: + return class_ + #max freq class def _max_class(self, classes, weights): + """Return the class with the highest weight, or the class that appears first with that weight (e.g., input_df1) + """ ht = self._get_ht_dict(classes, weights) - return sorted(ht.items(), key=operator.itemgetter(1))[-1][0] + return self._get_top(classes, sorted(ht.items(), key=operator.itemgetter(1), reverse=True)) def _mean_class(self, classes, weights): + """Return the class closest to the mean weight, or the class that appears first with that weight (e.g., input_df1) + """ ht = self._get_ht_dict(classes, weights) mean_val = np.mean(ht.values()) - return sorted(((x, abs(y - mean_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))[0][0] + return self._get_top(classes, sorted(((x, abs(y - mean_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))) def _median_class(self, classes, weights): + """Return the class closest to the median weight, or the class that appears first with that weight (e.g., input_df1) + """ ht = self._get_ht_dict(classes, weights) median_val = np.median(ht.values()) - return sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))[0][0] + return self._get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))) #minimum frequency class def _min_class(self, classes, weights): + """Return the class with the minimal weight, or the class that appears first with that weight (e.g., input_df1) + """ ht = self._get_ht_dict(classes, weights) - return sorted(ht.items(), key=operator.itemgetter(1))[0][0] + return self._get_top(classes, sorted(ht.items(), key=operator.itemgetter(1))) def _adaboost(self, classes, gt): + """Weigh each of a DataFrame's guesses according to an adaboost-like scheme. + """ # weigh the incorrect classes higher than the correct classes - num_correct = len(np.where(classes == gt)) - total = classes.size# + #num_correct = len(np.where(classes == gt)) + #total = classes.size # this is not quite right, the weight here is for the classifier as a whole, not the weifghted error of a single mistake #weights = pd.Series(data=np.array(0.5 * np.log((1- error)/(error))) @@ -914,8 +931,6 @@ def _adaboost(self, classes, gt): #init_weights = np.sum(np.exp(-1 * gt * 1 * classes), axis=0) return (1.0) - def _ident(*args): - return args def _consensus_two(self, weighting, method, input_df1, input_df2): """Takes the classifications of different models and combines them in a meaningful manner. @@ -958,15 +973,10 @@ def _consensus_two(self, weighting, method, input_df1, input_df2): weighting = options[weighting % 7] method = options[method % 7] #Establish the weights for each dataframe/classifier - #guesses_gt = [df[['guess','class']] for df in dfs] - - weights = [] - - #for tup in guesses_gt: for df in dfs: tup = df[['guess', 'class']] - num_correct = len(np.where(tup['guess'] == tup['class'])) + num_correct = len(np.where(tup['guess'] == tup['class'])[0]) total_vals = len(tup['guess'].index) if weighting == 'accuracy': weights.append(float(num_correct) / float(total_vals)) @@ -974,6 +984,7 @@ def _consensus_two(self, weighting, method, input_df1, input_df2): weights.append(1.0) elif weighting == 'adaboost': weights.append(self._adaboost(tup['guess'], tup['class'])) + #Set the method function for evaluating each DataFrame method_f = None if method == 'max': method_f = self._max_class @@ -984,9 +995,11 @@ def _consensus_two(self, weighting, method, input_df1, input_df2): elif method == 'min': method_f = self._min_class - merged_guesses = pd.DataFrame(data=input_df1[['guess']], columns=['guess_1']) + # Initialize the dataFrame containing just the guesses, and to hold the results + merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1']) merged_guesses.loc[:, 'guess_2'] = input_df2['guess'] merged_guesses.loc[:, 'guess'] = None + for row_ix in merged_guesses.index: merged_guesses['guess'].loc[row_ix] = method_f(merged_guesses[['guess_1', 'guess_2']].iloc[row_ix], weights) combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]) @@ -994,6 +1007,7 @@ def _consensus_two(self, weighting, method, input_df1, input_df2): return combined_df.drop('guess', 1).join(merged_guesses['guess']).copy() else: return combined_df.join(merged_guesses['guess']) + def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3): """Takes the classifications of different models and combines them in a meaningful manner. @@ -1037,9 +1051,7 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3): weighting = options[weighting % 7] method = options[method % 7] #Establish the weights for each dataframe/classifier - #guesses_gt = [df[['guess','class']] for df in dfs] weights = [] - #for tup in guesses_gt: for df in dfs: tup = df[['guess', 'class']] num_correct = len(np.where(tup['guess'] == tup['class'])) @@ -1050,6 +1062,8 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3): weights.append(1.0) elif weighting == 'adaboost': weights.append(self._adaboost(tup['guess'], tup['class'])) + + #Set the method function for evaluating each DataFrame method_f = None if method == 'max': method_f = self._max_class @@ -1060,10 +1074,12 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3): elif method == 'min': method_f = self._min_class - merged_guesses = pd.DataFrame(data=input_df1[['guess']], columns=['guess_1']) + # Initialize the dataFrame containing just the guesses, and to hold the results + merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1']) merged_guesses.loc[:, 'guess_2'] = input_df2['guess'] merged_guesses.loc[:, 'guess_3'] = input_df3['guess'] merged_guesses.loc[:, 'guess'] = None + for row_ix in merged_guesses.index: merged_guesses['guess'].loc[row_ix] = method_f(merged_guesses[['guess_1', 'guess_2', 'guess_3']].iloc[row_ix], weights) combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]) @@ -1120,9 +1136,7 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in weighting = options[weighting % 7] method = options[method % 7] #Establish the weights for each dataframe/classifier - #guesses_gt = [df[['guess','class']] for df in dfs] weights = [] - #for tup in guesses_gt: for df in dfs: tup = df[['guess', 'class']] num_correct = len(np.where(tup['guess'] == tup['class'])) @@ -1133,6 +1147,8 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in weights.append(1.0) elif weighting == 'adaboost': weights.append(self._adaboost(tup['guess'], tup['class'])) + + #Set the method function for evaluating each DataFrame method_f = None if method == 'max': method_f = self._max_class @@ -1143,11 +1159,14 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in elif method == 'min': method_f = self._min_class - merged_guesses = pd.DataFrame(data=input_df1[['guess']], columns=['guess_1']) + # Initialize the dataFrame containing just the guesses, and to hold the results + merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1']) + merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1']) merged_guesses.loc[:, 'guess_2'] = input_df2['guess'] merged_guesses.loc[:, 'guess_3'] = input_df3['guess'] - merged_guesses.loc[:, 'guess_4'] = input_df3['guess'] + merged_guesses.loc[:, 'guess_4'] = input_df4['guess'] merged_guesses.loc[:, 'guess'] = None + for row_ix in merged_guesses.index: merged_guesses['guess'].loc[row_ix] = method_f(merged_guesses[['guess_1', 'guess_2', 'guess_3', 'guess_4']].iloc[row_ix], weights) combined_df = input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]) @@ -1561,7 +1580,6 @@ def _evaluate_individual(self, individual, training_testing_data): # Throw out GP expressions that are too large to be compiled in Python return 0., - print(individual) result = func(training_testing_data) result = result[result['group'] == 'testing'] res = self.scoring_function(result) From 4ff886c20b587bcb9c5aac8e0cfe15d5d8560ce4 Mon Sep 17 00:00:00 2001 From: Nathan Date: Fri, 26 Feb 2016 21:36:16 -0600 Subject: [PATCH 05/13] Removed adaboost, moved some consensus operator options to class fields. --- tests.py | 18 +++++++------- tpot/tpot.py | 69 +++++++++++++++++++++++----------------------------- 2 files changed, 40 insertions(+), 47 deletions(-) diff --git a/tests.py b/tests.py index 276336d6..d2aaa1bf 100644 --- a/tests.py +++ b/tests.py @@ -163,21 +163,21 @@ def test_consensus(): res_four_ada_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], 'guess': [1,0,0,0,1,1,1,0,1,1]}) - tests = [(tpot_obj._consensus_two, 0, 3, df1, df2), #accuracy, max - (tpot_obj._consensus_two, 0, 4, df1, df2), #accuracy mean - (tpot_obj._consensus_two, 0, 5, df1, df2), # accuracy median - (tpot_obj._consensus_two, 0, 6, df1, df2), # accuracy min - (tpot_obj._consensus_two, 1, 3, df1, df2), # uniform max - (tpot_obj._consensus_two, 2, 3, df1, df2), # adaboost max - (tpot_obj._consensus_three, 2, 3, df1, df3, df4), # adaboost max - (tpot_obj._consensus_four, 2, 3, df1, df2, df3, df4) #adaboost max + tests = [(tpot_obj._consensus_two, 0, 2, df1, df2), #accuracy, max + (tpot_obj._consensus_two, 0, 3, df1, df2), #accuracy mean + (tpot_obj._consensus_two, 0, 4, df1, df2), # accuracy median + (tpot_obj._consensus_two, 0, 5, df1, df2), # accuracy min + (tpot_obj._consensus_two, 1, 2, df1, df2), # uniform max + #(tpot_obj._consensus_two, 2, 3, df1, df2), # adaboost max + (tpot_obj._consensus_three, 1, 2, df1, df3, df4), # uni max + (tpot_obj._consensus_four, 1, 2, df1, df2, df3, df4) #uni max ] results = [res_two_acc_max, res_two_acc_mean, res_two_acc_med, res_two_acc_min, res_two_uni_max, - res_two_ada_max, + #res_two_ada_max, res_three_ada_max, res_four_ada_max ] diff --git a/tpot/tpot.py b/tpot/tpot.py index 36276a13..bb73c6c1 100644 --- a/tpot/tpot.py +++ b/tpot/tpot.py @@ -149,6 +149,10 @@ def __init__(self, population_size=100, generations=100, self.scoring_function=self._balanced_accuracy else: self.scoring_function=scoring_function + + self._consensus_options = ['accuracy', 'uniform', 'max', 'mean', 'median', 'min'] + self._num_consensus_options = len(self._consensus_options) + self._consensus_opt_split_ix = 1 def fit(self, features, classes, feature_names=None): @@ -918,20 +922,6 @@ def _min_class(self, classes, weights): ht = self._get_ht_dict(classes, weights) return self._get_top(classes, sorted(ht.items(), key=operator.itemgetter(1))) - def _adaboost(self, classes, gt): - """Weigh each of a DataFrame's guesses according to an adaboost-like scheme. - """ - # weigh the incorrect classes higher than the correct classes - #num_correct = len(np.where(classes == gt)) - #total = classes.size - # this is not quite right, the weight here is for the classifier as a whole, not the weifghted error of a single mistake - #weights = pd.Series(data=np.array(0.5 * np.log((1- error)/(error))) - - # e ^ - y_i * alpha_m * k_m (x_i) - #init_weights = np.sum(np.exp(-1 * gt * 1 * classes), axis=0) - return (1.0) - - def _consensus_two(self, weighting, method, input_df1, input_df2): """Takes the classifications of different models and combines them in a meaningful manner. @@ -962,16 +952,19 @@ def _consensus_two(self, weighting, method, input_df1, input_df2): return df return dfs[0].copy() - #if weighting not in ['accuracy', 'uniform', 'adaboost']: - if weighting % 7 > 2: + options = self._consensus_options + num_options = self._num_consensus_options + opt_split_ix = self._consensus_opt_split_ix + + #if weighting not in ['accuracy', 'uniform']: + if weighting % num_options > (opt_split_ix): return dfs[0].copy() #if method not in ['max', 'mean', 'median', 'min']: - if method % 7 <= 2: + if method % num_options <= (opt_split_ix): return dfs[0].copy() - options = ['accuracy', 'uniform', 'adaboost', 'max', 'mean', 'median', 'min'] - weighting = options[weighting % 7] - method = options[method % 7] + weighting = options[weighting % num_options] + method = options[method % num_options] #Establish the weights for each dataframe/classifier weights = [] for df in dfs: @@ -982,8 +975,6 @@ def _consensus_two(self, weighting, method, input_df1, input_df2): weights.append(float(num_correct) / float(total_vals)) elif weighting == 'uniform': weights.append(1.0) - elif weighting == 'adaboost': - weights.append(self._adaboost(tup['guess'], tup['class'])) #Set the method function for evaluating each DataFrame method_f = None if method == 'max': @@ -1040,16 +1031,19 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3): return df return dfs[0].copy() - #if weighting not in ['accuracy', 'uniform', 'adaboost']: - if weighting % 7 > 2: + options = self._consensus_options + num_options = self._num_consensus_options + opt_split_ix = self._consensus_opt_split_ix + + #if weighting not in ['accuracy', 'uniform']: + if weighting % num_options > opt_split_ix: return dfs[0].copy() #if method not in ['max', 'mean', 'median', 'min']: - if method % 7 <= 2: + if method % num_options <= opt_split_ix: return dfs[0].copy() - options = ['accuracy', 'uniform', 'adaboost', 'max', 'mean', 'median', 'min'] - weighting = options[weighting % 7] - method = options[method % 7] + weighting = options[weighting % num_options] + method = options[method % num_options] #Establish the weights for each dataframe/classifier weights = [] for df in dfs: @@ -1060,8 +1054,6 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3): weights.append(float(num_correct) / float(total_vals)) elif weighting == 'uniform': weights.append(1.0) - elif weighting == 'adaboost': - weights.append(self._adaboost(tup['guess'], tup['class'])) #Set the method function for evaluating each DataFrame method_f = None @@ -1125,16 +1117,19 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in return df return dfs[0].copy() - #if weighting not in ['accuracy', 'uniform', 'adaboost']: - if weighting % 7 > 2: + options = self._consensus_options + num_options = self._num_consensus_options + opt_split_ix = self._consensus_opt_split_ix + + #if weighting not in ['accuracy', 'uniform']: + if weighting % num_options > opt_split_ix: return dfs[0].copy() #if method not in ['max', 'mean', 'median', 'min']: - if method % 7 <= 2: + if method % num_options <= opt_split_ix: return dfs[0].copy() - options = ['accuracy', 'uniform', 'adaboost', 'max', 'mean', 'median', 'min'] - weighting = options[weighting % 7] - method = options[method % 7] + weighting = options[weighting % num_options] + method = options[method % num_options] #Establish the weights for each dataframe/classifier weights = [] for df in dfs: @@ -1145,8 +1140,6 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in weights.append(float(num_correct) / float(total_vals)) elif weighting == 'uniform': weights.append(1.0) - elif weighting == 'adaboost': - weights.append(self._adaboost(tup['guess'], tup['class'])) #Set the method function for evaluating each DataFrame method_f = None From 191c4facedb2fe69d26a3315ee38d98c731e1424 Mon Sep 17 00:00:00 2001 From: Nathan Date: Fri, 26 Feb 2016 21:47:37 -0600 Subject: [PATCH 06/13] cleanup comments, remove ident and uncomment combine_dfs --- tpot/tpot.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tpot/tpot.py b/tpot/tpot.py index bb73c6c1..f3467509 100644 --- a/tpot/tpot.py +++ b/tpot/tpot.py @@ -108,7 +108,7 @@ def __init__(self, population_size=100, generations=100, self.pset.addPrimitive(self.svc, [pd.DataFrame, float], pd.DataFrame) self.pset.addPrimitive(self.knnc, [pd.DataFrame, int], pd.DataFrame) self.pset.addPrimitive(self.gradient_boosting, [pd.DataFrame, float, int, int], pd.DataFrame) - #self.pset.addPrimitive(self._combine_dfs, [pd.DataFrame, pd.DataFrame], pd.DataFrame) + self.pset.addPrimitive(self._combine_dfs, [pd.DataFrame, pd.DataFrame], pd.DataFrame) self.pset.addPrimitive(self._variance_threshold, [pd.DataFrame, float], pd.DataFrame) self.pset.addPrimitive(self._select_kbest, [pd.DataFrame, int], pd.DataFrame) self.pset.addPrimitive(self._select_percentile, [pd.DataFrame, int], pd.DataFrame) @@ -125,7 +125,6 @@ def __init__(self, population_size=100, generations=100, self.pset.addPrimitive(self._consensus_two, [int, int, pd.DataFrame, pd.DataFrame], pd.DataFrame) self.pset.addPrimitive(self._consensus_three, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) self.pset.addPrimitive(self._consensus_four, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) - #self.pset.addPrimitive(self._ident, [str], str) for val in range(0, 101): self.pset.addTerminal(val, int) @@ -875,7 +874,6 @@ def gradient_boosting(self, input_df, learning_rate, n_estimators, max_depth): def _get_ht_dict(self, classes, weights): ret = {} - #instead of weights of class needs to be weights of classifier ctr = 0 for cls in classes: try: @@ -894,7 +892,6 @@ def _get_top(self, classes, tups): if class_ in values: return class_ - #max freq class def _max_class(self, classes, weights): """Return the class with the highest weight, or the class that appears first with that weight (e.g., input_df1) """ @@ -915,7 +912,6 @@ def _median_class(self, classes, weights): median_val = np.median(ht.values()) return self._get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))) - #minimum frequency class def _min_class(self, classes, weights): """Return the class with the minimal weight, or the class that appears first with that weight (e.g., input_df1) """ From 22a3e19b2281d7ca8356bda8290b2c8fa8e465a9 Mon Sep 17 00:00:00 2001 From: Nathan Date: Fri, 26 Feb 2016 21:47:37 -0600 Subject: [PATCH 07/13] cleanup comments, remove ident and uncomment combine_dfs --- tpot/tpot.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tpot/tpot.py b/tpot/tpot.py index bb73c6c1..cceee419 100644 --- a/tpot/tpot.py +++ b/tpot/tpot.py @@ -108,7 +108,7 @@ def __init__(self, population_size=100, generations=100, self.pset.addPrimitive(self.svc, [pd.DataFrame, float], pd.DataFrame) self.pset.addPrimitive(self.knnc, [pd.DataFrame, int], pd.DataFrame) self.pset.addPrimitive(self.gradient_boosting, [pd.DataFrame, float, int, int], pd.DataFrame) - #self.pset.addPrimitive(self._combine_dfs, [pd.DataFrame, pd.DataFrame], pd.DataFrame) + self.pset.addPrimitive(self._combine_dfs, [pd.DataFrame, pd.DataFrame], pd.DataFrame) self.pset.addPrimitive(self._variance_threshold, [pd.DataFrame, float], pd.DataFrame) self.pset.addPrimitive(self._select_kbest, [pd.DataFrame, int], pd.DataFrame) self.pset.addPrimitive(self._select_percentile, [pd.DataFrame, int], pd.DataFrame) @@ -125,7 +125,6 @@ def __init__(self, population_size=100, generations=100, self.pset.addPrimitive(self._consensus_two, [int, int, pd.DataFrame, pd.DataFrame], pd.DataFrame) self.pset.addPrimitive(self._consensus_three, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) self.pset.addPrimitive(self._consensus_four, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) - #self.pset.addPrimitive(self._ident, [str], str) for val in range(0, 101): self.pset.addTerminal(val, int) @@ -875,7 +874,6 @@ def gradient_boosting(self, input_df, learning_rate, n_estimators, max_depth): def _get_ht_dict(self, classes, weights): ret = {} - #instead of weights of class needs to be weights of classifier ctr = 0 for cls in classes: try: @@ -894,7 +892,6 @@ def _get_top(self, classes, tups): if class_ in values: return class_ - #max freq class def _max_class(self, classes, weights): """Return the class with the highest weight, or the class that appears first with that weight (e.g., input_df1) """ @@ -915,7 +912,6 @@ def _median_class(self, classes, weights): median_val = np.median(ht.values()) return self._get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))) - #minimum frequency class def _min_class(self, classes, weights): """Return the class with the minimal weight, or the class that appears first with that weight (e.g., input_df1) """ @@ -1600,8 +1596,6 @@ def _balanced_accuracy(self, result): all_classes = list(set(result['class'].values)) all_class_accuracies = [] for this_class in all_classes: - #print("this class {}".format(this_class)) - #print("result['guess'] {}".format(result['guess'])) this_class_accuracy = len(result[(result['guess'] == this_class) \ & (result['class'] == this_class)])\ / float(len(result[result['class'] == this_class])) From c560f1abd241160cf43e6cbd00518e0a7b913a8f Mon Sep 17 00:00:00 2001 From: Nathan Date: Fri, 26 Feb 2016 21:56:06 -0600 Subject: [PATCH 08/13] Added comment to get_ht_dict --- tpot/tpot.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tpot/tpot.py b/tpot/tpot.py index cceee419..bec213ac 100644 --- a/tpot/tpot.py +++ b/tpot/tpot.py @@ -873,6 +873,8 @@ def gradient_boosting(self, input_df, learning_rate, n_estimators, max_depth): def _get_ht_dict(self, classes, weights): + """Return a dictionary where the keys are the unique class values present in this row of guesses, and the weights are the weights assigned to each guess. + """ ret = {} ctr = 0 for cls in classes: From cbf49459c418a5e98a0874881c7df23bdef5b99e Mon Sep 17 00:00:00 2001 From: Nathan Date: Sat, 27 Feb 2016 09:14:30 -0600 Subject: [PATCH 09/13] Updated export_utils, cleanup --- tpot/export_utils.py | 256 +++++++++++++++++++++++++++++++++++++++++++ tpot/tpot.py | 7 +- 2 files changed, 259 insertions(+), 4 deletions(-) diff --git a/tpot/export_utils.py b/tpot/export_utils.py index f1813bae..7c284239 100644 --- a/tpot/export_utils.py +++ b/tpot/export_utils.py @@ -22,6 +22,91 @@ import deap +consensus_options = ['accuracy', 'uniform', 'max', 'mean', 'median', 'min'] +num_consensus_options = 6 +consensus_opt_split_ix = 1 + +def consensus_operator_prefix(weight_scheme, method, operator_text): + """Utility function for generating the first part of the consensus operator text + + Parameters + ---------- + weight_scheme: integer + The corrected-for weight_scheme index in consensus_options + method: integer + The corrected-for method index in consensus_options + operator_text: String + The current operator export string (to be appended onto) + + Returns + ------- + operator_text: String + The updated operator export string + """ + + operator_text +='''\n +def _get_ht_dict(classes, weights): + """Return a dictionary where the keys are the unique class values present in this row of guesses, and the weights are the weights assigned to each guess. + """ + ret = {} + ctr = 0 + for cls in classes: + try: + ret[cls] += weights[ctr] + except: + ret[cls] = weights[ctr] + ctr += 1 + return ret + +def _get_top( classes, tups): + """Return the class from the row in the first DataFrame passed to the function (e.g., input_df1) + """ + values = [tup[0] for tup in tups if tup[1] == tups[0][1]] + for class_ in classes: + if class_ in values: + return class_ + ''' + if consensus_options[method % num_consensus_options] == 'max': + operator_text += '''\n +def _max_class(classes, weights): + """Return the class with the highest weight, or the class that appears first with that weight (e.g., input_df1) + """ + ht = _get_ht_dict(classes, weights) + return _get_top(classes, sorted(ht.items(), key=operator.itemgetter(1), reverse=True)) +method = _max_class + ''' + elif consensus_options[method % num_consensus_options] == 'mean': + operator_text += '''\n +def _mean_class( classes, weights): + """Return the class closest to the mean weight, or the class that appears first with that weight (e.g., input_df1) + """ + ht = _get_ht_dict(classes, weights) + mean_val = np.mean(ht.values()) + return _get_top(classes, sorted(((x, abs(y - mean_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))) +method = _mean_class + ''' + elif consensus_options[method % num_consensus_options] == 'median': + operator_text += '''\n +def _median_class(classes, weights): + """Return the class closest to the median weight, or the class that appears first with that weight (e.g., input_df1) + """ + ht = _get_ht_dict(classes, weights) + median_val = np.median(ht.values()) + return _get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))) +method = _median_class + ''' + elif consensus_options[method % num_consensus_options] == 'min': + operator_text += '''\n +def _min_class(classes, weights): + """Return the class with the minimal weight, or the class that appears first with that weight (e.g., input_df1) + """ + ht = _get_ht_dict(classes, weights) + return _get_top(classes, sorted(ht.items(), key=operator.itemgetter(1))) +method = _min_class + ''' + return operator_text + + def replace_mathematical_operators(exported_pipeline): """Replace all of the mathematical operators with their results for use in export(self, output_file_name) @@ -291,6 +376,177 @@ def replace_function_calls(pipeline_list): operator_text += '\n# Combine two DataFrames' operator_text += '\n{2} = {0}.join({1}[[column for column in {1}.columns.values if column not in {0}.columns.values]])\n'.format(operator[2], operator[3], result_name) + elif operator_name == '_consensus_two': + weight_scheme = int(operator[2]) + method = int(operator[3]) + if weight_scheme % num_consensus_options > consensus_opt_split_ix: + weight_scheme = consensus_opt_split_ix + if method % num_consensus_options <= consensus_opt_split_ix: + method = consensus_opt_split_ix + 1 + + operator_text += consensus_operator_prefix(weight_scheme, method, operator_text) + + operator_text += '\n# Combine two DataFrames' + operator_text += '\ndfs = [{0}, {1}]'.format(operator[4], operator[5]) + operator_text += ''' +ignore_consensus = False +if any(len(df.columns) == 3 for df in dfs): + found = False + + for df in dfs: + if len(df.columns) > 3: + {0} = df.copy() + found = True + break + if not found: + ignore_consensus = True + {0} = df.copy() + +if not ignore_consensus: + weights = [] + for df in dfs: + tup = df[['guess', 'class']] + num_correct = len(np.where(tup['guess'] == tup['class'])[0]) + total_vals = len(tup['guess'].index)'''.format(result_name) + if consensus_options[weight_scheme % num_consensus_options] == 'accuracy': + operator_text +=''' + weights.append(float(num_correct) / float(total_vals)) + ''' + elif consensus_options[weight_scheme % num_consensus_options] == 'uniform': + operator_text +=''' + weights.append(1.0) + ''' + operator_text += ''' + # Initialize the dataFrame containing just the guesses, and to hold the results + merged_guesses = pd.DataFrame(data={0}[['guess']].values, columns=['guess_1']) + merged_guesses.loc[:, 'guess_2'] = {1}['guess'] + merged_guesses.loc[:, 'guess'] = None + + for row_ix in merged_guesses.index: + merged_guesses['guess'].loc[row_ix] = method(merged_guesses[['guess_1', 'guess_2']].iloc[row_ix], weights) + {2} = {0}.join({1}[[column for column in {1}.columns.values if column not in {0}.columns.values]]) + if 'guess' in {2}.columns.values: + {2} = {2}.drop('guess', 1).join(merged_guesses['guess']).copy() + else: + {2} = {2}.join(merged_guesses['guess']) + '''.format(operator[4], operator[5], result_name) + + elif operator_name == '_consensus_three': + weight_scheme = int(operator[2]) + method = int(operator[3]) + if weight_scheme % num_consensus_options > consensus_opt_split_ix: + weight_scheme = consensus_opt_split_ix + if method % num_consensus_options <= consensus_opt_split_ix: + method = consensus_opt_split_ix + 1 + + operator_text += consensus_operator_prefix(weight_scheme, method, operator_text) + + operator_text += '\n# Combine three DataFrames' + operator_text += '\ndfs = [{0}, {1}, {2}]'.format(operator[4], operator[5], operator[6]) + operator_text += ''' +ignore_consensus = False +if any(len(df.columns) == 3 for df in dfs): + found = False + + for df in dfs: + if len(df.columns) > 3: + {0} = df.copy() + found = True + break + if not found: + ignore_consensus = True + {0} = df.copy() + +if not ignore_consensus: + weights = [] + for df in dfs: + tup = df[['guess', 'class']] + num_correct = len(np.where(tup['guess'] == tup['class'])[0]) + total_vals = len(tup['guess'].index)'''.format(result_name) + if consensus_options[weight_scheme % num_consensus_options] == 'accuracy': + operator_text +=''' + weights.append(float(num_correct) / float(total_vals)) + ''' + elif consensus_options[weight_scheme % num_consensus_options] == 'uniform': + operator_text +=''' + weights.append(1.0) + ''' + operator_text += ''' + # Initialize the dataFrame containing just the guesses, and to hold the results + merged_guesses = pd.DataFrame(data={0}[['guess']].values, columns=['guess_1']) + merged_guesses.loc[:, 'guess_2'] = {1}['guess'] + merged_guesses.loc[:, 'guess_3'] = {2}['guess'] + merged_guesses.loc[:, 'guess'] = None + + for row_ix in merged_guesses.index: + merged_guesses['guess'].loc[row_ix] = method(merged_guesses[['guess_1', 'guess_2', 'guess_3']].iloc[row_ix], weights) + {3} = {0}.join({1}[[column for column in {1}.columns.values if column not in {0}.columns.values]]) + {3} = {3}.join({2}[[column for column in {2}.columns.values if column not in {3}.columns.values]]) + if 'guess' in {3}.columns.values: + {3} = {3}.drop('guess', 1).join(merged_guesses['guess']).copy() + else: + {3} = {3}.join(merged_guesses['guess']) + '''.format(operator[4], operator[5], operator[6], result_name) + + elif operator_name == '_consensus_four': + weight_scheme = int(operator[2]) + method = int(operator[3]) + if weight_scheme % num_consensus_options > consensus_opt_split_ix: + weight_scheme = consensus_opt_split_ix + if method % num_consensus_options <= consensus_opt_split_ix: + method = consensus_opt_split_ix + 1 + + operator_text += consensus_operator_prefix(weight_scheme, method, operator_text) + + operator_text += '\n# Combine four DataFrames' + operator_text += '\ndfs = [{0}, {1}, {2}, {3}]'.format(operator[4], operator[5], operator[6], operator[7]) + operator_text += ''' +ignore_consensus = False +if any(len(df.columns) == 3 for df in dfs): + found = False + + for df in dfs: + if len(df.columns) > 3: + {0} = df.copy() + found = True + break + if not found: + ignore_consensus = True + {0} = df.copy() + +if not ignore_consensus: + weights = [] + for df in dfs: + tup = df[['guess', 'class']] + num_correct = len(np.where(tup['guess'] == tup['class'])[0]) + total_vals = len(tup['guess'].index)'''.format(result_name) + if consensus_options[weight_scheme % num_consensus_options] == 'accuracy': + operator_text +=''' + weights.append(float(num_correct) / float(total_vals)) + ''' + elif consensus_options[weight_scheme % num_consensus_options] == 'uniform': + operator_text +=''' + weights.append(1.0) + ''' + operator_text += ''' + # Initialize the dataFrame containing just the guesses, and to hold the results + merged_guesses = pd.DataFrame(data={0}[['guess']].values, columns=['guess_1']) + merged_guesses.loc[:, 'guess_2'] = {1}['guess'] + merged_guesses.loc[:, 'guess_3'] = {2}['guess'] + merged_guesses.loc[:, 'guess_4'] = {3}['guess'] + merged_guesses.loc[:, 'guess'] = None + + for row_ix in merged_guesses.index: + merged_guesses['guess'].loc[row_ix] = method(merged_guesses[['guess_1', 'guess_2', 'guess_3', 'guess_4']].iloc[row_ix], weights) + {4} = {0}.join({1}[[column for column in {1}.columns.values if column not in {0}.columns.values]]) + {4} = {4}.join({2}[[column for column in {2}.columns.values if column not in {4}.columns.values]]) + {4} = {4}.join({3}[[column for column in {3}.columns.values if column not in {4}.columns.values]]) + if 'guess' in {4}.columns.values: + {4} = {4}.drop('guess', 1).join(merged_guesses['guess']).copy() + else: + {4} = {4}.join(merged_guesses['guess']) + '''.format(operator[4], operator[5], operator[6], result_name) + elif operator_name == '_variance_threshold': operator_text += ''' # Use Scikit-learn's VarianceThreshold for feature selection diff --git a/tpot/tpot.py b/tpot/tpot.py index 26a67919..0bff51d7 100644 --- a/tpot/tpot.py +++ b/tpot/tpot.py @@ -131,9 +131,9 @@ def __init__(self, population_size=100, generations=100, self._pset.addPrimitive(self._div, [int, int], float) # Other operators - self.pset.addPrimitive(self._consensus_two, [int, int, pd.DataFrame, pd.DataFrame], pd.DataFrame) - self.pset.addPrimitive(self._consensus_three, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) - self.pset.addPrimitive(self._consensus_four, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) + self._pset.addPrimitive(self._consensus_two, [int, int, pd.DataFrame, pd.DataFrame], pd.DataFrame) + self._pset.addPrimitive(self._consensus_three, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) + self._pset.addPrimitive(self._consensus_four, [int, int, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.DataFrame) for val in range(0, 101): self._pset.addTerminal(val, int) for val in [100.0, 10.0, 1.0, 0.1, 0.01, 0.001, 0.0001]: @@ -845,7 +845,6 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in # Initialize the dataFrame containing just the guesses, and to hold the results merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1']) - merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1']) merged_guesses.loc[:, 'guess_2'] = input_df2['guess'] merged_guesses.loc[:, 'guess_3'] = input_df3['guess'] merged_guesses.loc[:, 'guess_4'] = input_df4['guess'] From 26a162e264d12f4f497abaddc239583c07e1eed4 Mon Sep 17 00:00:00 2001 From: Nathan Date: Sat, 27 Feb 2016 09:27:03 -0600 Subject: [PATCH 10/13] Corrected one line in export_utils for the consensus_operators --- tpot/export_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tpot/export_utils.py b/tpot/export_utils.py index 7c284239..aa8ce54d 100644 --- a/tpot/export_utils.py +++ b/tpot/export_utils.py @@ -400,7 +400,7 @@ def replace_function_calls(pipeline_list): break if not found: ignore_consensus = True - {0} = df.copy() + {0} = dfs[0].copy() if not ignore_consensus: weights = [] @@ -455,7 +455,7 @@ def replace_function_calls(pipeline_list): break if not found: ignore_consensus = True - {0} = df.copy() + {0} = dfs[0].copy() if not ignore_consensus: weights = [] @@ -512,7 +512,7 @@ def replace_function_calls(pipeline_list): break if not found: ignore_consensus = True - {0} = df.copy() + {0} = dfs[0].copy() if not ignore_consensus: weights = [] From 7fac07316c194204a071f41d1f005e2634edb735 Mon Sep 17 00:00:00 2001 From: Nathan Date: Sat, 27 Feb 2016 10:01:00 -0600 Subject: [PATCH 11/13] Updated print statements in tests to be print function call from python3 --- tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests.py b/tests.py index 736b94c3..c1a1014a 100644 --- a/tests.py +++ b/tests.py @@ -186,7 +186,7 @@ def test_consensus(): ] for test, res in zip(tests, results): - print test[1:3] - print test[0](*test[1:])['guess'].values + print(test[1:3]) + print(test[0](*test[1:])['guess'].values) assert np.array_equal(test[0](*test[1:])['guess'].values, res['guess'].values) From 47248ede495d70ca1c088ac93bfaeefa1199f3f4 Mon Sep 17 00:00:00 2001 From: Nathan Date: Sat, 27 Feb 2016 17:07:03 -0600 Subject: [PATCH 12/13] 2to3 conversion of consensus stuff, updated export_utils and tests --- tests.py | 34 +++++++++++++++++++++------------- tpot/export_utils.py | 12 ++++++------ tpot/tpot.py | 16 ++++++++-------- 3 files changed, 35 insertions(+), 27 deletions(-) diff --git a/tests.py b/tests.py index c1a1014a..7c39e618 100644 --- a/tests.py +++ b/tests.py @@ -92,15 +92,15 @@ def test_xgboost(): def test_combine_dfs(): tpot_obj = TPOT() - df1 = pd.DataFrame({'a': range(10), - 'b': range(10, 20)}) + df1 = pd.DataFrame({'a': list(range(10)), + 'b': list(range(10, 20))}) - df2 = pd.DataFrame({'b': range(10, 20), - 'c': range(20, 30)}) + df2 = pd.DataFrame({'b': list(range(10, 20)), + 'c': list(range(20, 30))}) - combined_df = pd.DataFrame({'a': range(10), - 'b': range(10, 20), - 'c': range(20, 30)}) + combined_df = pd.DataFrame({'a': list(range(10)), + 'b': list(range(10, 20)), + 'c': list(range(20, 30))}) assert tpot_obj._combine_dfs(df1, df2).equals(combined_df) @@ -130,16 +130,24 @@ def test_static_models(): def test_consensus(): tpot_obj = TPOT() - df1 = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + df1 = pd.DataFrame({'00001': [2,2,2,2,2,2,2,2,2,2], + '00002': [3,3,3,3,3,3,3,3,3,3], + 'class': [1,1,1,1,1,0,0,0,0,0], 'guess': [1,0,1,0,1,0,1,0,1,0]}) - df2 = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + df2 = pd.DataFrame({'00001': [2,2,2,2,2,2,2,2,2,2], + '00002': [3,3,3,3,3,3,3,3,3,3], + 'class': [1,1,1,1,1,0,0,0,0,0], 'guess': [0,0,0,0,0,1,1,1,1,1]}) - df3 = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + df3 = pd.DataFrame({'00001': [2,2,2,2,2,2,2,2,2,2], + '00002': [3,3,3,3,3,3,3,3,3,3], + 'class': [1,1,1,1,1,0,0,0,0,0], 'guess': [0,1,0,1,0,1,0,1,0,1]}) - df4 = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + df4 = pd.DataFrame({'00001': [2,2,2,2,2,2,2,2,2,2], + '00002': [3,3,3,3,3,3,3,3,3,3], + 'class': [1,1,1,1,1,0,0,0,0,0], 'guess': [1,1,0,0,1,1,0,0,1,1]}) uniform = [1.0] * 10 @@ -186,7 +194,7 @@ def test_consensus(): ] for test, res in zip(tests, results): - print(test[1:3]) - print(test[0](*test[1:])['guess'].values) + #print(test[1:3]) + #print(test[0](*test[1:])['guess'].values) assert np.array_equal(test[0](*test[1:])['guess'].values, res['guess'].values) diff --git a/tpot/export_utils.py b/tpot/export_utils.py index aa8ce54d..78dd5bf8 100644 --- a/tpot/export_utils.py +++ b/tpot/export_utils.py @@ -72,7 +72,7 @@ def _max_class(classes, weights): """Return the class with the highest weight, or the class that appears first with that weight (e.g., input_df1) """ ht = _get_ht_dict(classes, weights) - return _get_top(classes, sorted(ht.items(), key=operator.itemgetter(1), reverse=True)) + return _get_top(classes, sorted(list(ht.items()), key=operator.itemgetter(1), reverse=True)) method = _max_class ''' elif consensus_options[method % num_consensus_options] == 'mean': @@ -81,8 +81,8 @@ def _mean_class( classes, weights): """Return the class closest to the mean weight, or the class that appears first with that weight (e.g., input_df1) """ ht = _get_ht_dict(classes, weights) - mean_val = np.mean(ht.values()) - return _get_top(classes, sorted(((x, abs(y - mean_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))) + mean_val = np.mean(list(ht.values())) + return _get_top(classes, sorted(((x, abs(y - mean_val)) for (x,y) in list(ht.items())), key=operator.itemgetter(1))) method = _mean_class ''' elif consensus_options[method % num_consensus_options] == 'median': @@ -91,8 +91,8 @@ def _median_class(classes, weights): """Return the class closest to the median weight, or the class that appears first with that weight (e.g., input_df1) """ ht = _get_ht_dict(classes, weights) - median_val = np.median(ht.values()) - return _get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))) + median_val = np.median(list(ht.values())) + return _get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in list(ht.items())), key=operator.itemgetter(1))) method = _median_class ''' elif consensus_options[method % num_consensus_options] == 'min': @@ -101,7 +101,7 @@ def _min_class(classes, weights): """Return the class with the minimal weight, or the class that appears first with that weight (e.g., input_df1) """ ht = _get_ht_dict(classes, weights) - return _get_top(classes, sorted(ht.items(), key=operator.itemgetter(1))) + return _get_top(classes, sorted(list(ht.items()), key=operator.itemgetter(1))) method = _min_class ''' return operator_text diff --git a/tpot/tpot.py b/tpot/tpot.py index 0bff51d7..c21845fb 100644 --- a/tpot/tpot.py +++ b/tpot/tpot.py @@ -18,7 +18,7 @@ """ -from __future__ import print_function + import argparse import operator import random @@ -593,27 +593,27 @@ def _max_class(self, classes, weights): """Return the class with the highest weight, or the class that appears first with that weight (e.g., input_df1) """ ht = self._get_ht_dict(classes, weights) - return self._get_top(classes, sorted(ht.items(), key=operator.itemgetter(1), reverse=True)) + return self._get_top(classes, sorted(list(ht.items()), key=operator.itemgetter(1), reverse=True)) def _mean_class(self, classes, weights): """Return the class closest to the mean weight, or the class that appears first with that weight (e.g., input_df1) """ ht = self._get_ht_dict(classes, weights) - mean_val = np.mean(ht.values()) - return self._get_top(classes, sorted(((x, abs(y - mean_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))) + mean_val = np.mean(list(ht.values())) + return self._get_top(classes, sorted(((x, abs(y - mean_val)) for (x,y) in list(ht.items())), key=operator.itemgetter(1))) def _median_class(self, classes, weights): """Return the class closest to the median weight, or the class that appears first with that weight (e.g., input_df1) """ ht = self._get_ht_dict(classes, weights) - median_val = np.median(ht.values()) - return self._get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in ht.items()), key=operator.itemgetter(1))) + median_val = np.median(list(ht.values())) + return self._get_top(classes, sorted(((x, abs(y - median_val)) for (x,y) in list(ht.items())), key=operator.itemgetter(1))) def _min_class(self, classes, weights): """Return the class with the minimal weight, or the class that appears first with that weight (e.g., input_df1) """ ht = self._get_ht_dict(classes, weights) - return self._get_top(classes, sorted(ht.items(), key=operator.itemgetter(1))) + return self._get_top(classes, sorted(list(ht.items()), key=operator.itemgetter(1))) def _consensus_two(self, weighting, method, input_df1, input_df2): """Takes the classifications of different models and combines them in a meaningful manner. @@ -1488,7 +1488,7 @@ def float_range(value): args = parser.parse_args() if args.version: - from _version import __version__ + from ._version import __version__ print('TPOT version: {}'.format(__version__)) return elif args.input_file is None: From 1e621508a6369a90d5b2ce03abc5323aa17704f1 Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 8 Mar 2016 22:38:31 -0600 Subject: [PATCH 13/13] Added threshold consensus operator --- tests.py | 4 ++++ tpot/export_utils.py | 19 +++++++++++++++++-- tpot/tpot.py | 21 ++++++++++++++++++++- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/tests.py b/tests.py index 7c39e618..f6d44553 100644 --- a/tests.py +++ b/tests.py @@ -169,6 +169,8 @@ def test_consensus(): 'guess': [1,0,1,0,1,0,1,0,1,0]}) res_two_ada_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], 'guess': [1,0,1,0,1,0,1,0,1,0]}) + res_three_uni_thresh = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], + 'guess': [0,0,0,0,0,1,1,1,1,1]}) res_three_ada_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], 'guess': [1,1,0,0,1,1,0,0,1,1]}) res_four_ada_max = pd.DataFrame({'class': [1,1,1,1,1,0,0,0,0,0], @@ -180,6 +182,7 @@ def test_consensus(): (tpot_obj._consensus_two, 0, 5, df1, df2), # accuracy min (tpot_obj._consensus_two, 1, 2, df1, df2), # uniform max #(tpot_obj._consensus_two, 2, 3, df1, df2), # adaboost max + (tpot_obj._consensus_three, 1, 6, df1, df2, df3), #uni threshold (tpot_obj._consensus_three, 1, 2, df1, df3, df4), # uni max (tpot_obj._consensus_four, 1, 2, df1, df2, df3, df4) #uni max ] @@ -189,6 +192,7 @@ def test_consensus(): res_two_acc_min, res_two_uni_max, #res_two_ada_max, + res_three_uni_thresh, res_three_ada_max, res_four_ada_max ] diff --git a/tpot/export_utils.py b/tpot/export_utils.py index 78dd5bf8..73abdc07 100644 --- a/tpot/export_utils.py +++ b/tpot/export_utils.py @@ -22,8 +22,8 @@ import deap -consensus_options = ['accuracy', 'uniform', 'max', 'mean', 'median', 'min'] -num_consensus_options = 6 +consensus_options = ['accuracy', 'uniform', 'max', 'mean', 'median', 'min', 'threshold'] +num_consensus_options = 7 consensus_opt_split_ix = 1 def consensus_operator_prefix(weight_scheme, method, operator_text): @@ -104,6 +104,21 @@ def _min_class(classes, weights): return _get_top(classes, sorted(list(ht.items()), key=operator.itemgetter(1))) method = _min_class ''' + elif consensus_options[method % num_consensus_options] == 'threshold': + operator_text += '''\n +def _threshold_class(self, classes, weights): + """Return the class with that contains a certain percentage of the weight + """ + ht = _get_ht_dict(classes, weights) + total_weight = sum(list(ht.values())) + threshold = 0.75 + sorted_classes = sorted(((x, float(y) / total_weight) for (x,y) in list(ht.items()) if (float(y) / total_weight) > threshold), key=operator.itemgetter(1), reverse=True) + while len(sorted_classes) == 0: + threshold = threshold - 0.05 + sorted_classes = sorted(((x, float(y) / total_weight) for (x,y) in list(ht.items()) if (float(y) / total_weight) > threshold), key=operator.itemgetter(1), reverse=True) + + return _get_top(classes, sorted_classes) + ''' return operator_text diff --git a/tpot/tpot.py b/tpot/tpot.py index 0000df84..32fa8b66 100644 --- a/tpot/tpot.py +++ b/tpot/tpot.py @@ -157,7 +157,7 @@ def __init__(self, population_size=100, generations=100, else: self.scoring_function=scoring_function - self._consensus_options = ['accuracy', 'uniform', 'max', 'mean', 'median', 'min'] + self._consensus_options = ['accuracy', 'uniform', 'max', 'mean', 'median', 'min', 'threshold'] self._num_consensus_options = len(self._consensus_options) self._consensus_opt_split_ix = 1 @@ -615,6 +615,19 @@ def _min_class(self, classes, weights): ht = self._get_ht_dict(classes, weights) return self._get_top(classes, sorted(list(ht.items()), key=operator.itemgetter(1))) + def _threshold_class(self, classes, weights): + """Return the class with that contains a certain percentage of the weight + """ + ht = self._get_ht_dict(classes, weights) + total_weight = sum(list(ht.values())) + threshold = 0.75 + sorted_classes = sorted(((x, float(y) / total_weight) for (x,y) in list(ht.items()) if (float(y) / total_weight) > threshold), key=operator.itemgetter(1), reverse=True) + while len(sorted_classes) == 0: + threshold = threshold - 0.05 + sorted_classes = sorted(((x, float(y) / total_weight) for (x,y) in list(ht.items()) if (float(y) / total_weight) > threshold), key=operator.itemgetter(1), reverse=True) + + return self._get_top(classes, sorted_classes) + def _consensus_two(self, weighting, method, input_df1, input_df2): """Takes the classifications of different models and combines them in a meaningful manner. @@ -678,6 +691,8 @@ def _consensus_two(self, weighting, method, input_df1, input_df2): method_f = self._median_class elif method == 'min': method_f = self._min_class + elif method =='threshold': + method_f = self._threshold_class # Initialize the dataFrame containing just the guesses, and to hold the results merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1']) @@ -758,6 +773,8 @@ def _consensus_three(self, weighting, method, input_df1, input_df2, input_df3): method_f = self._median_class elif method == 'min': method_f = self._min_class + elif method =='threshold': + method_f = self._threshold_class # Initialize the dataFrame containing just the guesses, and to hold the results merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1']) @@ -842,6 +859,8 @@ def _consensus_four(self, weighting, method, input_df1, input_df2, input_df3, in method_f = self._median_class elif method == 'min': method_f = self._min_class + elif method =='threshold': + method_f = self._threshold_class # Initialize the dataFrame containing just the guesses, and to hold the results merged_guesses = pd.DataFrame(data=input_df1[['guess']].values, columns=['guess_1'])