diff --git a/benchmarks/cifar_exp/fte_bte_exp.py b/benchmarks/cifar_exp/fte_bte_exp.py index 975918cf59..6d68789e97 100644 --- a/benchmarks/cifar_exp/fte_bte_exp.py +++ b/benchmarks/cifar_exp/fte_bte_exp.py @@ -1,12 +1,15 @@ #%% import random -import matplotlib.pyplot as plt import tensorflow as tf -import keras -from keras import layers +from tensorflow import keras +from tensorflow.keras import layers from itertools import product import pandas as pd +from losses import ( + SupervisedContrastiveLoss, +) # adapted version of SupConLoss for ftebte setting, uses cosine similarity matrix + import numpy as np import pickle @@ -16,6 +19,8 @@ from joblib import Parallel, delayed from multiprocessing import Pool +from tensorflow.keras.optimizers import Adam +from tensorflow.keras.callbacks import EarlyStopping from proglearn.progressive_learner import ProgressiveLearner from proglearn.deciders import SimpleArgmaxAverage from proglearn.transformers import ( @@ -24,9 +29,8 @@ ) from proglearn.voters import TreeClassificationVoter, KNNClassificationVoter -import tensorflow as tf - import time +import sys #%% def unpickle(file): @@ -35,6 +39,25 @@ def unpickle(file): return dict +def get_size(obj, seen=None): + """Recursively finds size of objects""" + size = sys.getsizeof(obj) + if seen is None: + seen = set() + obj_id = id(obj) + if obj_id in seen: + return 0 + # Important mark as seen *before* entering recursion to gracefully handle + # self-referential objects + seen.add(obj_id) + if isinstance(obj, dict): + size += sum([get_size(v, seen) for v in obj.values()]) + size += sum([get_size(k, seen) for k in obj.keys()]) + elif hasattr(obj, "__dict__"): + size += get_size(obj.__dict__, seen) + return size + + #%% def LF_experiment( train_x, @@ -58,6 +81,8 @@ def LF_experiment( train_times_across_tasks = [] single_task_inference_times_across_tasks = [] multitask_inference_times_across_tasks = [] + time_info = [] + mem_info = [] if model == "dnn": default_transformer_class = NeuralClassificationTransformer @@ -123,10 +148,16 @@ def LF_experiment( default_transformer_kwargs = { "network": network, "euclidean_layer_idx": -2, - "num_classes": 10, - "optimizer": keras.optimizers.Adam(3e-4), + "loss": SupervisedContrastiveLoss, + "optimizer": Adam(3e-4), + "fit_kwargs": { + "epochs": 100, + "callbacks": [EarlyStopping(patience=5, monitor="val_loss")], + "verbose": False, + "validation_split": 0.33, + "batch_size": 32, + }, } - default_voter_class = KNNClassificationVoter default_voter_kwargs = {"k": int(np.log2(num_points_per_task))} @@ -152,10 +183,12 @@ def LF_experiment( for task_ii in range(10): print("Starting Task {} For Fold {}".format(task_ii, shift)) + + train_start_time = time.time() + if acorn is not None: np.random.seed(acorn) - train_start_time = time.time() progressive_learner.add_task( X=train_x[ task_ii * 5000 @@ -168,7 +201,7 @@ def LF_experiment( + (slot + 1) * num_points_per_task ], num_transformers=1 if model == "dnn" else ntrees, - transformer_voter_decider_split=[0.67, 0.33, 0], + transformer_voter_decider_split=[0.63, 0.37, 0], decider_kwargs={ "classes": np.unique( train_y[ @@ -181,17 +214,54 @@ def LF_experiment( ) train_end_time = time.time() + single_learner = ProgressiveLearner( + default_transformer_class=default_transformer_class, + default_transformer_kwargs=default_transformer_kwargs, + default_voter_class=default_voter_class, + default_voter_kwargs=default_voter_kwargs, + default_decider_class=default_decider_class, + ) + + if acorn is not None: + np.random.seed(acorn) + + 
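+        # Fit an independent single-task learner on the same data slice; its
+        # accuracy is the single-task baseline from which FTE/BTE are computed.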
+        single_learner.add_task(
+            X=train_x[
+                task_ii * 5000
+                + slot * num_points_per_task : task_ii * 5000
+                + (slot + 1) * num_points_per_task
+            ],
+            y=train_y[
+                task_ii * 5000
+                + slot * num_points_per_task : task_ii * 5000
+                + (slot + 1) * num_points_per_task
+            ],
+            num_transformers=1 if model == "dnn" else (task_ii + 1) * ntrees,
+            transformer_voter_decider_split=[0.67, 0.33, 0],
+            decider_kwargs={
+                "classes": np.unique(
+                    train_y[
+                        task_ii * 5000
+                        + slot * num_points_per_task : task_ii * 5000
+                        + (slot + 1) * num_points_per_task
+                    ]
+                )
+            },
+        )
+
+        time_info.append(train_end_time - train_start_time)
+        mem_info.append(get_size(progressive_learner))
         train_times_across_tasks.append(train_end_time - train_start_time)

         single_task_inference_start_time = time.time()
-        llf_task = progressive_learner.predict(
+        single_task = single_learner.predict(
             X=test_x[task_ii * 1000 : (task_ii + 1) * 1000, :],
-            transformer_ids=[task_ii],
-            task_id=task_ii,
+            transformer_ids=[0],
+            task_id=0,
         )
         single_task_inference_end_time = time.time()
         single_task_accuracies[task_ii] = np.mean(
-            llf_task == test_y[task_ii * 1000 : (task_ii + 1) * 1000]
+            single_task == test_y[task_ii * 1000 : (task_ii + 1) * 1000]
         )
         single_task_inference_times_across_tasks.append(
             single_task_inference_end_time - single_task_inference_start_time
@@ -236,8 +306,7 @@
             + str(ntrees)
             + "_"
             + str(shift)
-            + "_"
-            + str(slot)
+            + "_SupervisedContrastiveLoss"  # slot dropped from the name; only one slot is run
             + ".pickle"
         )
         with open(file_to_save, "wb") as f:
@@ -350,9 +419,9 @@ def run_parallel_exp(
     )

     if model == "dnn":
-        config = tf.ConfigProto()
+        config = tf.compat.v1.ConfigProto()
         config.gpu_options.allow_growth = True
-        sess = tf.Session(config=config)
+        sess = tf.compat.v1.Session(config=config)
         with tf.device("/gpu:" + str(shift % 4)):
             LF_experiment(
                 train_x,
@@ -383,8 +452,8 @@ def run_parallel_exp(
 #%%
 ### MAIN HYPERPARAMS ###
-model = "uf"
-num_points_per_task = 500
+model = "dnn"
+num_points_per_task = 500  # unchanged: 500 samples per task
 ########################

 (X_train, y_train), (X_test, y_test) = keras.datasets.cifar100.load_data()
@@ -399,7 +468,7 @@ def run_parallel_exp(

 #%%
 if model == "uf":
-    slot_fold = range(10)
+    slot_fold = range(1)
     shift_fold = range(1, 7, 1)
     n_trees = [10]
     iterable = product(n_trees, shift_fold, slot_fold)
@@ -410,24 +479,15 @@ def run_parallel_exp(
         for ntree, shift, slot in iterable
     )
 elif model == "dnn":
-    slot_fold = range(10)
+    slot_fold = range(10)  # default from the original script; overridden below
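+    # The Pool-based two-stage runner below is replaced by a sequential loop,
+    # presumably so each TF/GPU session runs inside a single process.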
-    def perform_shift(shift_slot_tuple):
-        shift, slot = shift_slot_tuple
-        return run_parallel_exp(
-            data_x, data_y, 0, model, num_points_per_task, slot=slot, shift=shift
-        )
-
-    print("Performing Stage 1 Shifts")
-    stage_1_shifts = range(1, 5)
-    stage_1_iterable = product(stage_1_shifts, slot_fold)
-    with Pool(4) as p:
-        p.map(perform_shift, stage_1_iterable)
-
-    print("Performing Stage 2 Shifts")
-    stage_2_shifts = range(5, 7)
-    stage_2_iterable = product(stage_2_shifts, slot_fold)
-    with Pool(4) as p:
-        p.map(perform_shift, stage_2_iterable)
+    # run all shifts sequentially, one slot only
+    slot_fold = range(1)
+    shift_fold = [1, 2, 3, 4, 5, 6]
+    n_trees = [0]
+    iterable = product(n_trees, shift_fold, slot_fold)
-# %%
+    for ntree, shift, slot in iterable:
+        run_parallel_exp(
+            data_x, data_y, ntree, model, num_points_per_task, slot=slot, shift=shift
+        )
diff --git a/benchmarks/cifar_exp/losses.py b/benchmarks/cifar_exp/losses.py
new file mode 100644
index 0000000000..b3c25dc501
--- /dev/null
+++ b/benchmarks/cifar_exp/losses.py
@@ -0,0 +1,63 @@
+import math
+import tensorflow as tf
+from tensorflow.keras import backend as K
+import tensorflow_addons as tfa
+
+
+def logDiff(yTrue, yPred):
+    return K.sum(K.log(yTrue) - K.log(yPred))
+
+
+# Sum over samples in the batch (anchors) ->
+# average over same-class samples (positives) ->
+# of the -log softmax of the positive over the negatives (w.r.t. cosine similarity),
+# i.e. \sum_i -1/|P(i)| \sum_{p \in P(i)} log [exp(z_i @ z_p / t) / \sum_{n \in N(i)} exp(z_i @ z_n / t)]
+#    = \sum_i [log[\sum_{n \in N(i)} exp(z_i @ z_n / t)] - 1/|P(i)| \sum_{p \in P(i)} z_i @ z_p / t]
+def supervised_contrastive_loss(yTrue, yPred):
+    temp = 0.1
+    r = yPred
+    y = yTrue
+    r, _ = tf.linalg.normalize(r, axis=1)
+    r_dists = tf.matmul(r, tf.transpose(r))
+    r_dists = tf.linalg.set_diag(
+        r_dists, tf.zeros(r_dists.shape[0], dtype=r_dists.dtype)
+    )  # exclude each sample's similarity with itself
+    r_dists = r_dists / temp
+    # pairwise squared distances between one-hot labels: 0 for same class, 2 otherwise
+    y_norms = tf.reduce_sum(y * y, 1)
+    y = y_norms[:, None] - 2 * tf.matmul(y, tf.transpose(y)) + y_norms[None, :]
+    y = tf.cast(y / 2, r_dists.dtype)  # rescale to 0 (positive pair) / 1 (negative pair)
+    negative_sum = tf.math.log(
+        tf.reduce_sum(y * tf.exp(r_dists), axis=1)
+    )  # y is 1 only for negative pairs, so this sums over N(i)
+    positive_sum = (1 - y) * r_dists
+    n_nonzero = tf.math.reduce_sum(1 - y, axis=1) - 1  # subtract the diagonal
+    positive_sum = tf.reduce_sum(positive_sum, axis=1) / tf.cast(
+        n_nonzero, positive_sum.dtype
+    )
+    loss = tf.reduce_sum(negative_sum - positive_sum)
+    return loss
+
+
+# siamese networks version
+def contrastiveLoss(yTrue, yPred):
+    # make sure the datatypes are the same
+    yTrue = tf.cast(yTrue, yPred.dtype)
+    squaredPreds = K.square(yPred)
+    squaredMargin = K.square(K.maximum(1 - yPred, 0))
+    loss = K.mean(yTrue * squaredPreds + (1 - yTrue) * squaredMargin)
+    return loss
+
+
+# cosine similarity of two vectors: normalize each, then take the dot product
+def cosSimilarity(vec1, vec2):
+    v1, _ = tf.linalg.normalize(vec1)
+    v2, _ = tf.linalg.normalize(vec2)
+    sim = tf.reduce_sum(tf.multiply(v1, v2))
+    return sim
+
+
+def SupervisedContrastiveLoss(yTrue, yPred):
+    # npairs-loss variant of SupCon: the temperature-scaled cosine-similarity
+    # matrix serves as logits. tf.reduce_sum(y * y, 1) collapses yTrue to one
+    # scalar per sample; this preserves the class grouping for non-negative
+    # integer labels (label -> label ** 2) but is constant for one-hot targets.
+    temp = 0.1
+    r = yPred
+    y = yTrue
+    r, _ = tf.linalg.normalize(r, axis=1)
+    r_dists = tf.matmul(r, tf.transpose(r))
+    logits = tf.divide(r_dists, temp)
+    return tfa.losses.npairs_loss(tf.squeeze(tf.reduce_sum(y * y, 1)), logits)
diff --git a/benchmarks/cifar_exp/plot_compare_two_algos.py b/benchmarks/cifar_exp/plot_compare_two_algos.py
new file mode 100644
index 0000000000..10c01165c9
--- /dev/null
+++ b/benchmarks/cifar_exp/plot_compare_two_algos.py
@@ -0,0 +1,347 @@
+#%%
+import pickle
+import matplotlib.pyplot as plt
+from matplotlib import rcParams
+
+rcParams.update({"figure.autolayout": True})
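+
+# Compare two saved ftebte runs (algo1 vs. algo2 below): plot the per-task
+# differences in FTE, BTE, TE, and accuracy, plus train/inference timing.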
+import numpy as np +from itertools import product +import seaborn as sns + +### MAIN HYPERPARAMS ### +ntrees = 0 +shifts = 6 +task_num = 10 +model = "dnn" +######################## +algo1_name = "SupervisedContrastiveLoss" +algo2_name = "CategoricalCrossEntropy" +#%% +def unpickle(file): + with open(file, "rb") as fo: + dict = pickle.load(fo, encoding="bytes") + return dict + + +def get_fte_bte(err, single_err, ntrees): + bte = [[] for i in range(10)] + te = [[] for i in range(10)] + fte = [] + + for i in range(10): + for j in range(i, 10): + bte[i].append(err[i][i] / err[j][i]) + te[i].append(single_err[i] / err[j][i]) + + for i in range(10): + fte.append(single_err[i] / err[i][i]) + + return fte, bte, te + + +def calc_mean_bte(btes, task_num=10, reps=6): + mean_bte = [[] for i in range(task_num)] + + for j in range(task_num): + tmp = 0 + for i in range(reps): + tmp += np.array(btes[i][j]) + + tmp = tmp / reps + mean_bte[j].extend(tmp) + + return mean_bte + + +def calc_mean_te(tes, task_num=10, reps=6): + mean_te = [[] for i in range(task_num)] + + for j in range(task_num): + tmp = 0 + for i in range(reps): + tmp += np.array(tes[i][j]) + + tmp = tmp / reps + mean_te[j].extend(tmp) + + return mean_te + + +def calc_mean_fte(ftes, task_num=10, reps=6): + fte = np.asarray(ftes) + + return list(np.mean(np.asarray(fte), axis=0)) + + +def calc_mean_err(err, task_num=10, reps=6): + mean_err = [[] for i in range(task_num)] + + for j in range(task_num): + tmp = 0 + for i in range(reps): + tmp += np.array(err[i][j]) + + tmp = tmp / reps + mean_err[j].extend([tmp]) + + return mean_err + + +def calc_mean_multitask_time(multitask_time, task_num=10, reps=6): + mean_multitask_time = [[] for i in range(task_num)] + + for j in range(task_num): + tmp = 0 + for i in range(reps): + tmp += np.array(multitask_time[i][j]) + + tmp = tmp / reps + mean_multitask_time[j].extend([tmp]) + + return mean_multitask_time + + +#%% +reps = shifts + +btes = [[] for i in range(task_num)] +ftes = [[] for i in range(task_num)] +tes = [[] for i in range(task_num)] +err_ = [[] for i in range(task_num)] +btes2 = [[] for i in range(task_num)] +ftes2 = [[] for i in range(task_num)] +tes2 = [[] for i in range(task_num)] +err_2 = [[] for i in range(task_num)] + + +te_tmp = [[] for _ in range(reps)] +bte_tmp = [[] for _ in range(reps)] +fte_tmp = [[] for _ in range(reps)] +err_tmp = [[] for _ in range(reps)] +train_time_tmp = [[] for _ in range(reps)] +single_task_inference_time_tmp = [[] for _ in range(reps)] +multitask_inference_time_tmp = [[] for _ in range(reps)] +te_tmp2 = [[] for _ in range(reps)] +bte_tmp2 = [[] for _ in range(reps)] +fte_tmp2 = [[] for _ in range(reps)] +err_tmp2 = [[] for _ in range(reps)] +train_time_tmp2 = [[] for _ in range(reps)] +single_task_inference_time_tmp2 = [[] for _ in range(reps)] +multitask_inference_time_tmp2 = [[] for _ in range(reps)] + +count = 0 +for shift in range(shifts): + filename = ( + "result/result/" + + model + + str(ntrees) + + "_" + + str(shift + 1) + + "_" + + algo1_name + + ".pickle" + ) + filename2 = ( + "result/result/" + + model + + str(ntrees) + + "_" + + str(shift + 1) + + "_" + + algo2_name + + ".pickle" + ) + multitask_df, single_task_df = unpickle(filename) + multitask_df2, single_task_df2 = unpickle(filename2) + err = [[] for _ in range(10)] + multitask_inference_times = [[] for _ in range(10)] + err2 = [[] for _ in range(10)] + multitask_inference_times2 = [[] for _ in range(10)] + for ii in range(10): + err[ii].extend( + 1 - 
+            np.array(multitask_df[multitask_df["base_task"] == ii + 1]["accuracy"])
+        )
+        err2[ii].extend(
+            1
+            - np.array(multitask_df2[multitask_df2["base_task"] == ii + 1]["accuracy"])
+        )
+        multitask_inference_times[ii].extend(
+            np.array(
+                multitask_df[multitask_df["base_task"] == ii + 1][
+                    "multitask_inference_times"
+                ]
+            )
+        )
+        multitask_inference_times2[ii].extend(
+            np.array(
+                multitask_df2[multitask_df2["base_task"] == ii + 1][
+                    "multitask_inference_times"
+                ]
+            )
+        )
+    single_err = 1 - np.array(single_task_df["accuracy"])
+    single_err2 = 1 - np.array(single_task_df2["accuracy"])
+    fte, bte, te = get_fte_bte(err, single_err, ntrees)
+    fte2, bte2, te2 = get_fte_bte(err2, single_err2, ntrees)
+
+    err_ = [[] for i in range(task_num)]
+    for i in range(task_num):
+        for j in range(task_num - i):
+            err_[i].append(err[i + j][i])
+    err_2 = [[] for i in range(task_num)]
+    for i in range(task_num):
+        for j in range(task_num - i):
+            err_2[i].append(err2[i + j][i])
+
+    train_time_tmp[count].extend(np.array(single_task_df["train_times"]))
+    single_task_inference_time_tmp[count].extend(
+        np.array(single_task_df["single_task_inference_times"])
+    )
+    multitask_inference_time_tmp[count].extend(multitask_inference_times)
+    te_tmp[count].extend(te)
+    bte_tmp[count].extend(bte)
+    fte_tmp[count].extend(fte)
+    err_tmp[count].extend(err_)
+    train_time_tmp2[count].extend(np.array(single_task_df2["train_times"]))
+    single_task_inference_time_tmp2[count].extend(
+        np.array(single_task_df2["single_task_inference_times"])
+    )
+    multitask_inference_time_tmp2[count].extend(multitask_inference_times2)
+    te_tmp2[count].extend(te2)
+    bte_tmp2[count].extend(bte2)
+    fte_tmp2[count].extend(fte2)
+    err_tmp2[count].extend(err_2)
+    count += 1
+
+te = calc_mean_te(te_tmp, reps=reps)
+bte = calc_mean_bte(bte_tmp, reps=reps)
+fte = calc_mean_fte(fte_tmp, reps=reps)
+error = calc_mean_err(err_tmp, reps=reps)
+te2 = calc_mean_te(te_tmp2, reps=reps)
+bte2 = calc_mean_bte(bte_tmp2, reps=reps)
+fte2 = calc_mean_fte(fte_tmp2, reps=reps)
+error2 = calc_mean_err(err_tmp2, reps=reps)
+
+train_time = np.mean(train_time_tmp, axis=0)
+single_task_inference_time = np.mean(single_task_inference_time_tmp, axis=0)
+multitask_inference_time = calc_mean_multitask_time(multitask_inference_time_tmp)
+multitask_inference_time = [
+    np.mean(multitask_inference_time[i]) for i in range(len(multitask_inference_time))
+]
+train_time2 = np.mean(train_time_tmp2, axis=0)
+single_task_inference_time2 = np.mean(single_task_inference_time_tmp2, axis=0)
+multitask_inference_time2 = calc_mean_multitask_time(multitask_inference_time_tmp2)
+multitask_inference_time2 = [
+    np.mean(multitask_inference_time2[i]) for i in range(len(multitask_inference_time2))
+]
+
+#%%
+sns.set()
+
+n_tasks = 10
+clr = ["#e41a1c", "#a65628", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#CCCC00"]
+
+fontsize = 22
+ticksize = 20
+
+fig, ax = plt.subplots(2, 2, figsize=(16, 11.5))
+fig.suptitle(algo1_name + " - " + algo2_name, fontsize=25)
+# each panel plots algo1 minus algo2, matching the title above
+difference = []
+for fte_i, fte2_i in zip(fte, fte2):
+    difference.append(fte_i - fte2_i)
+ax[0][0].plot(
+    np.arange(1, n_tasks + 1),
+    difference,
+    c="red",
+    marker=".",
+    markersize=14,
+    linewidth=3,
+)
+# the zero line marks "no difference between the two algorithms"
+ax[0][0].hlines(0, 1, n_tasks, colors="grey", linestyles="dashed", linewidth=1.5)
+ax[0][0].tick_params(labelsize=ticksize)
+ax[0][0].set_xlabel("Number of tasks seen", fontsize=fontsize)
+ax[0][0].set_ylabel("FTE Difference", fontsize=fontsize)
+
+
+for i in range(n_tasks):
+
+    et = np.asarray(bte[i])
+    et2 = np.asarray(bte2[i])
+    ns = np.arange(i + 1, n_tasks + 1)
+    ax[0][1].plot(ns, et - et2, c="red", linewidth=2.6)
+
+ax[0][1].set_xlabel("Number of tasks seen", fontsize=fontsize)
+ax[0][1].set_ylabel("BTE Difference", fontsize=fontsize)
+ax[0][1].tick_params(labelsize=ticksize)
+ax[0][1].hlines(0, 1, n_tasks, colors="grey", linestyles="dashed", linewidth=1.5)
+
+
+for i in range(n_tasks):
+
+    et = np.asarray(te[i])
+    et2 = np.asarray(te2[i])
+    ns = np.arange(i + 1, n_tasks + 1)
+    ax[1][0].plot(ns, et - et2, c="red", linewidth=2.6)
+
+ax[1][0].set_xlabel("Number of tasks seen", fontsize=fontsize)
+ax[1][0].set_ylabel("TE Difference", fontsize=fontsize)
+ax[1][0].tick_params(labelsize=ticksize)
+ax[1][0].hlines(0, 1, n_tasks, colors="grey", linestyles="dashed", linewidth=1.5)
+
+
+for i in range(n_tasks):
+    et = np.asarray(error[i][0])
+    et2 = np.asarray(error2[i][0])
+    ns = np.arange(i + 1, n_tasks + 1)
+
+    ax[1][1].plot(ns, (1 - et) - (1 - et2), c="red", linewidth=2.6)
+
+ax[1][1].set_xlabel("Number of tasks seen", fontsize=fontsize)
+ax[1][1].set_ylabel("Accuracy Difference", fontsize=fontsize)
+ax[1][1].tick_params(labelsize=ticksize)
+
+# savefig needs a file name, not a bare directory; write into result/figs/,
+# where the committed comparison figure lives
+plt.savefig(
+    "result/figs/" + algo1_name + "-" + algo2_name + "__" + model + ".pdf", dpi=300
+)
+plt.close()
+
+ax = plt.subplot(111)
+
+# Hide the right and top spines
+ax.spines["right"].set_visible(False)
+ax.spines["top"].set_visible(False)
+
+# Only show ticks on the left and bottom spines
+ax.yaxis.set_ticks_position("left")
+ax.xaxis.set_ticks_position("bottom")
+
+ax.plot(
+    range(len(train_time)),
+    train_time,
+    linewidth=3,
+    linestyle="solid",
+    label="Train Time",
+)
+ax.plot(
+    range(len(single_task_inference_time)),
+    single_task_inference_time,
+    linewidth=3,
+    linestyle="solid",
+    label="Single Task Inference Time",
+)
+ax.plot(
+    range(len(multitask_inference_time)),
+    multitask_inference_time,
+    linewidth=3,
+    linestyle="solid",
+    label="Multi-Task Inference Time",
+)
+
+
+ax.set_xlabel("Number of Tasks Seen", fontsize=fontsize)
+ax.set_ylabel("Time (seconds)", fontsize=fontsize)
+ax.tick_params(labelsize=ticksize)
+ax.legend(fontsize=22)
+
+plt.tight_layout()
+
+# %%
diff --git a/benchmarks/cifar_exp/result/figs/SupConLoss-CatCrossEntropy__dnn.pdf b/benchmarks/cifar_exp/result/figs/SupConLoss-CatCrossEntropy__dnn.pdf
new file mode 100644
index 0000000000..e017ab499e
Binary files /dev/null and b/benchmarks/cifar_exp/result/figs/SupConLoss-CatCrossEntropy__dnn.pdf differ
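Note: a minimal, self-contained sketch of the npairs-based loss added in losses.py, for sanity-checking it outside the benchmark. It assumes tensorflow and tensorflow_addons are installed; the batch of four 8-dimensional embeddings and the integer labels are made up for illustration, and the committed SupervisedContrastiveLoss instead derives its label vector from yTrue via tf.reduce_sum(y * y, 1).

import tensorflow as tf
import tensorflow_addons as tfa

def supcon_npairs(labels, embeddings, temp=0.1):
    # L2-normalize so the Gram matrix holds cosine similarities, then use
    # similarity / temperature as logits for the npairs loss.
    z, _ = tf.linalg.normalize(embeddings, axis=1)
    logits = tf.matmul(z, tf.transpose(z)) / temp
    return tfa.losses.npairs_loss(tf.squeeze(labels), logits)

labels = tf.constant([0, 0, 1, 1])      # two classes, two samples each
embeddings = tf.random.normal((4, 8))   # stand-in for penultimate-layer outputs
print(float(supcon_npairs(labels, embeddings)))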