Supervised Contrastive Loss #518

Open
wants to merge 10 commits into base: staging
140 changes: 100 additions & 40 deletions benchmarks/cifar_exp/fte_bte_exp.py
@@ -1,12 +1,15 @@
#%%
import random
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from keras import layers
from tensorflow import keras
from tensorflow.keras import layers
from itertools import product
import pandas as pd

from losses import (
SupervisedContrastiveLoss,
) # an adapted version of SupConLoss for the FTE/BTE setting; uses a cosine-similarity matrix

import numpy as np
import pickle

@@ -16,6 +19,8 @@
from joblib import Parallel, delayed
from multiprocessing import Pool

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from proglearn.progressive_learner import ProgressiveLearner
from proglearn.deciders import SimpleArgmaxAverage
from proglearn.transformers import (
@@ -24,9 +29,8 @@
)
from proglearn.voters import TreeClassificationVoter, KNNClassificationVoter

import tensorflow as tf

import time
import sys

#%%
def unpickle(file):
@@ -35,6 +39,25 @@ def unpickle(file):
return dict


def get_size(obj, seen=None):
"""Recursively finds size of objects"""
size = sys.getsizeof(obj)
if seen is None:
seen = set()
obj_id = id(obj)
if obj_id in seen:
return 0
# Important: mark the object as seen *before* recursing so that
# self-referential objects are handled gracefully
seen.add(obj_id)
if isinstance(obj, dict):
size += sum([get_size(v, seen) for v in obj.values()])
size += sum([get_size(k, seen) for k in obj.keys()])
elif hasattr(obj, "__dict__"):
size += get_size(obj.__dict__, seen)
return size
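
# A quick illustrative check of get_size (the toy class below is hypothetical,
# not part of this PR); note the function recurses through dicts and __dict__
# attributes of nested objects:
#
#     class Toy:
#         def __init__(self):
#             self.meta = {"name": "cifar", "tasks": 10}
#
#     get_size(Toy())  # bytes of the instance plus its nested dict contents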


#%%
def LF_experiment(
train_x,
@@ -58,6 +81,8 @@ def LF_experiment(
train_times_across_tasks = []
single_task_inference_times_across_tasks = []
multitask_inference_times_across_tasks = []
time_info = []
mem_info = []

if model == "dnn":
default_transformer_class = NeuralClassificationTransformer
@@ -123,10 +148,16 @@ def LF_experiment(
default_transformer_kwargs = {
"network": network,
"euclidean_layer_idx": -2,
"num_classes": 10,
"optimizer": keras.optimizers.Adam(3e-4),
"loss": SupervisedContrastiveLoss,
"optimizer": Adam(3e-4),
"fit_kwargs": {
"epochs": 100,
"callbacks": [EarlyStopping(patience=5, monitor="val_loss")],
"verbose": False,
"validation_split": 0.33,
"batch_size": 32,
},
}

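# the KNN voter classifies in the transformer's embedding space (the
# penultimate layer, euclidean_layer_idx=-2); k grows logarithmically
# with task size, e.g. k = int(log2(500)) = 8 for 500 points per task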
default_voter_class = KNNClassificationVoter
default_voter_kwargs = {"k": int(np.log2(num_points_per_task))}

@@ -152,10 +183,12 @@ def LF_experiment(

for task_ii in range(10):
print("Starting Task {} For Fold {}".format(task_ii, shift))

train_start_time = time.time()

if acorn is not None:
np.random.seed(acorn)

train_start_time = time.time()
progressive_learner.add_task(
X=train_x[
task_ii * 5000
@@ -168,7 +201,7 @@ def LF_experiment(
+ (slot + 1) * num_points_per_task
],
num_transformers=1 if model == "dnn" else ntrees,
transformer_voter_decider_split=[0.67, 0.33, 0],
transformer_voter_decider_split=[0.63, 0.37, 0],
decider_kwargs={
"classes": np.unique(
train_y[
@@ -181,17 +214,54 @@ def LF_experiment(
)
train_end_time = time.time()

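# a fresh learner trained on this task alone provides the single-task
# baseline against which transfer efficiency is measured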
single_learner = ProgressiveLearner(
default_transformer_class=default_transformer_class,
default_transformer_kwargs=default_transformer_kwargs,
default_voter_class=default_voter_class,
default_voter_kwargs=default_voter_kwargs,
default_decider_class=default_decider_class,
)

if acorn is not None:
np.random.seed(acorn)

single_learner.add_task(
X=train_x[
task_ii * 5000
+ slot * num_points_per_task : task_ii * 5000
+ (slot + 1) * num_points_per_task
],
y=train_y[
task_ii * 5000
+ slot * num_points_per_task : task_ii * 5000
+ (slot + 1) * num_points_per_task
],
num_transformers=1 if model == "dnn" else (task_ii + 1) * ntrees,
transformer_voter_decider_split=[0.67, 0.33, 0],
decider_kwargs={
"classes": np.unique(
train_y[
task_ii * 5000
+ slot * num_points_per_task : task_ii * 5000
+ (slot + 1) * num_points_per_task
]
)
},
)

time_info.append(train_end_time - train_start_time)
mem_info.append(get_size(progressive_learner))
train_times_across_tasks.append(train_end_time - train_start_time)

single_task_inference_start_time = time.time()
llf_task = progressive_learner.predict(
single_task = single_learner.predict(
X=test_x[task_ii * 1000 : (task_ii + 1) * 1000, :],
transformer_ids=[task_ii],
task_id=task_ii,
transformer_ids=[0],
task_id=0,
)
single_task_inference_end_time = time.time()
single_task_accuracies[task_ii] = np.mean(
llf_task == test_y[task_ii * 1000 : (task_ii + 1) * 1000]
single_task == test_y[task_ii * 1000 : (task_ii + 1) * 1000]
)
single_task_inference_times_across_tasks.append(
single_task_inference_end_time - single_task_inference_start_time
@@ -236,8 +306,7 @@ def LF_experiment(
+ str(ntrees)
+ "_"
+ str(shift)
+ "_"
+ str(slot)
+ "_SupervisedContrastiveLoss"
+ ".pickle"
)
with open(file_to_save, "wb") as f:
@@ -350,9 +419,9 @@ def run_parallel_exp(
)

if model == "dnn":
config = tf.ConfigProto()
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
sess = tf.compat.v1.Session(config=config)
with tf.device("/gpu:" + str(shift % 4)):
LF_experiment(
train_x,
@@ -383,8 +452,8 @@ def run_parallel_exp(

#%%
### MAIN HYPERPARAMS ###
model = "uf"
num_points_per_task = 500
model = "dnn"
num_points_per_task = 500  # changed from 5000 to 500
########################

(X_train, y_train), (X_test, y_test) = keras.datasets.cifar100.load_data()
@@ -399,7 +468,7 @@ def run_parallel_exp(

#%%
if model == "uf":
slot_fold = range(10)
slot_fold = range(1)
shift_fold = range(1, 7, 1)
n_trees = [10]
iterable = product(n_trees, shift_fold, slot_fold)
@@ -410,24 +479,15 @@ def run_parallel_exp(
for ntree, shift, slot in iterable
)
elif model == "dnn":
slot_fold = range(10)
slot_fold = range(10)  # TODO: confirm that the default of 10 slots is correct

def perform_shift(shift_slot_tuple):
shift, slot = shift_slot_tuple
return run_parallel_exp(
data_x, data_y, 0, model, num_points_per_task, slot=slot, shift=shift
)

print("Performing Stage 1 Shifts")
stage_1_shifts = range(1, 5)
stage_1_iterable = product(stage_1_shifts, slot_fold)
with Pool(4) as p:
p.map(perform_shift, stage_1_iterable)

print("Performing Stage 2 Shifts")
stage_2_shifts = range(5, 7)
stage_2_iterable = product(stage_2_shifts, slot_fold)
with Pool(4) as p:
p.map(perform_shift, stage_2_iterable)
# run shifts sequentially rather than through a multiprocessing Pool
slot_fold = range(1)
shift_fold = [1, 2, 3, 4, 5, 6]
n_trees = [0]
iterable = product(n_trees, shift_fold, slot_fold)

# %%
for ntree, shift, slot in iterable:
run_parallel_exp(
data_x, data_y, ntree, model, num_points_per_task, slot=slot, shift=shift
)
63 changes: 63 additions & 0 deletions benchmarks/cifar_exp/losses.py
@@ -0,0 +1,63 @@
import math
import tensorflow as tf
from tensorflow.keras import backend as K
import tensorflow_addons as tfa


def logDiff(yTrue, yPred):
return K.sum(K.log(yTrue) - K.log(yPred))


# Sum over samples in the batch (anchors) ->
# average over same-class samples (positives) ->
# of the -log softmax of each positive against the negatives (w.r.t. the cosine-similarity matrix),
# i.e. \sum_i -1/|P(i)| \sum_{p \in P(i)} log [exp(z_i @ z_p / t) / \sum_{n \in N(i)} exp(z_i @ z_n / t)]
#    = \sum_i [ log \sum_{n \in N(i)} exp(z_i @ z_n / t) - 1/|P(i)| \sum_{p \in P(i)} (z_i @ z_p / t) ]
def supervised_contrastive_loss(yTrue, yPred):
temp = 0.1
r = yPred
y = yTrue
r, _ = tf.linalg.normalize(r, axis=1)
r_dists = tf.matmul(r, tf.transpose(r))
r_dists = tf.linalg.set_diag(
r_dists, tf.zeros(r_dists.shape[0], dtype=r_dists.dtype)
) # zero out each sample's self-similarity on the diagonal
r_dists = r_dists / temp
y_norms = tf.reduce_sum(y * y, 1)
y = y_norms - 2 * tf.matmul(y, tf.transpose(y)) + tf.transpose(y_norms)
y = tf.cast(y / 2, r_dists.dtype)  # pairwise label distance: 0 for same class, 1 for different class
negative_sum = tf.math.log(
tf.reduce_sum(y * tf.exp(r_dists), axis=1)
) # y masks the sum to negatives only (0 on the diagonal and for same-class pairs)
positive_sum = (1 - y) * r_dists
n_nonzero = tf.math.reduce_sum(1 - y, axis=1) - 1 # Subtract diagonal
positive_sum = tf.reduce_sum(positive_sum, axis=1) / tf.cast(
n_nonzero, positive_sum.dtype
)
loss = tf.reduce_sum(negative_sum - positive_sum)
return loss
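
# A minimal sanity check of supervised_contrastive_loss (values illustrative
# only): four embeddings, two classes, one-hot labels.
#
#     import numpy as np
#     z = tf.constant(np.random.randn(4, 3), dtype=tf.float32)     # embeddings
#     y = tf.constant([[1., 0.], [1., 0.], [0., 1.], [0., 1.]])    # one-hot labels
#     supervised_contrastive_loss(y, z)  # scalar; shrinks as same-class
#                                        # embeddings align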


# siamese networks version
def contrastiveLoss(yTrue, yPred):
# make sure the datatypes are the same
yTrue = tf.cast(yTrue, yPred.dtype)
squaredPreds = K.square(yPred)
squaredMargin = K.square(K.maximum(1 - yPred, 0))
loss = K.mean(yTrue * squaredPreds + (1 - yTrue) * squaredMargin)
return loss
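
# A toy check of the siamese version (invented numbers): yPred is a predicted
# pair distance, yTrue is 1 for similar pairs and 0 for dissimilar ones.
#
#     d = tf.constant([0.1, 0.9])  # predicted distances for two pairs
#     y = tf.constant([1.0, 0.0])  # first pair similar, second dissimilar
#     contrastiveLoss(y, d)        # = mean([0.1**2, (1 - 0.9)**2]) = 0.01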


def cosSimilarity(vec1, vec2):
# dot product of the two vectors; assumes both are already L2-normalized
sim = tf.reduce_sum(tf.multiply(vec1, vec2))
return sim


def SupervisedContrastiveLoss(yTrue, yPred):
temp = 0.1
r = yPred
y = yTrue
r, _ = tf.linalg.normalize(r, axis=1)
r_dists = tf.matmul(r, tf.transpose(r))
logits = tf.divide(r_dists, temp)
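# note: tf.reduce_sum(y * y, 1) collapses each label row to a scalar; this
# preserves class identity for integer labels, but one-hot rows would all
# collapse to 1 -- an assumption worth verifying in the ftebte pipeline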
return tfa.losses.npairs_loss(tf.squeeze(tf.reduce_sum(y * y, 1)), logits)
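
# How the experiment wires this loss in (a sketch mirroring the
# default_transformer_kwargs in fte_bte_exp.py above; the standalone
# compile call itself is illustrative, not part of this PR):
#
#     model.compile(optimizer=Adam(3e-4), loss=SupervisedContrastiveLoss)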
347 changes: 347 additions & 0 deletions benchmarks/cifar_exp/plot_compare_two_algos.py
@@ -0,0 +1,347 @@
#%%
import pickle
import matplotlib.pyplot as plt
from matplotlib import rcParams

rcParams.update({"figure.autolayout": True})
import numpy as np
from itertools import product
import seaborn as sns

### MAIN HYPERPARAMS ###
ntrees = 0
shifts = 6
task_num = 10
model = "dnn"
########################
algo1_name = "SupervisedContrastiveLoss"
algo2_name = "CategoricalCrossEntropy"
#%%
def unpickle(file):
with open(file, "rb") as fo:
dict = pickle.load(fo, encoding="bytes")
return dict


def get_fte_bte(err, single_err, ntrees):
bte = [[] for i in range(10)]
te = [[] for i in range(10)]
fte = []

for i in range(10):
for j in range(i, 10):
bte[i].append(err[i][i] / err[j][i])
te[i].append(single_err[i] / err[j][i])

for i in range(10):
fte.append(single_err[i] / err[i][i])

return fte, bte, te
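
# Worked example of the transfer-efficiency formulas (numbers invented):
#
#     single_err_0 = 0.40                  # single-task error on task 0
#     e00, e10, e20 = 0.35, 0.32, 0.30     # task-0 error after tasks 0, 1, 2
#     fte_0 = single_err_0 / e00           # ~1.14: forward transfer helps
#     bte_0 = [e00 / e for e in (e00, e10, e20)]  # [1.0, ~1.09, ~1.17]
#     te_0 = [single_err_0 / e for e in (e00, e10, e20)]  # overall transfer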


def calc_mean_bte(btes, task_num=10, reps=6):
mean_bte = [[] for i in range(task_num)]

for j in range(task_num):
tmp = 0
for i in range(reps):
tmp += np.array(btes[i][j])

tmp = tmp / reps
mean_bte[j].extend(tmp)

return mean_bte
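
# calc_mean_bte (and the similar helpers below) average a list-of-lists
# across repetitions; e.g. with invented numbers:
#
#     btes = [[[1.0, 1.1]], [[1.2, 1.3]]]      # 2 reps, task 0 only
#     calc_mean_bte(btes, task_num=1, reps=2)  # [[1.1, 1.2]]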


def calc_mean_te(tes, task_num=10, reps=6):
mean_te = [[] for i in range(task_num)]

for j in range(task_num):
tmp = 0
for i in range(reps):
tmp += np.array(tes[i][j])

tmp = tmp / reps
mean_te[j].extend(tmp)

return mean_te


def calc_mean_fte(ftes, task_num=10, reps=6):
fte = np.asarray(ftes)

return list(np.mean(np.asarray(fte), axis=0))


def calc_mean_err(err, task_num=10, reps=6):
mean_err = [[] for i in range(task_num)]

for j in range(task_num):
tmp = 0
for i in range(reps):
tmp += np.array(err[i][j])

tmp = tmp / reps
mean_err[j].extend([tmp])

return mean_err


def calc_mean_multitask_time(multitask_time, task_num=10, reps=6):
mean_multitask_time = [[] for i in range(task_num)]

for j in range(task_num):
tmp = 0
for i in range(reps):
tmp += np.array(multitask_time[i][j])

tmp = tmp / reps
mean_multitask_time[j].extend([tmp])

return mean_multitask_time


#%%
reps = shifts

btes = [[] for i in range(task_num)]
ftes = [[] for i in range(task_num)]
tes = [[] for i in range(task_num)]
err_ = [[] for i in range(task_num)]
btes2 = [[] for i in range(task_num)]
ftes2 = [[] for i in range(task_num)]
tes2 = [[] for i in range(task_num)]
err_2 = [[] for i in range(task_num)]


te_tmp = [[] for _ in range(reps)]
bte_tmp = [[] for _ in range(reps)]
fte_tmp = [[] for _ in range(reps)]
err_tmp = [[] for _ in range(reps)]
train_time_tmp = [[] for _ in range(reps)]
single_task_inference_time_tmp = [[] for _ in range(reps)]
multitask_inference_time_tmp = [[] for _ in range(reps)]
te_tmp2 = [[] for _ in range(reps)]
bte_tmp2 = [[] for _ in range(reps)]
fte_tmp2 = [[] for _ in range(reps)]
err_tmp2 = [[] for _ in range(reps)]
train_time_tmp2 = [[] for _ in range(reps)]
single_task_inference_time_tmp2 = [[] for _ in range(reps)]
multitask_inference_time_tmp2 = [[] for _ in range(reps)]

count = 0
for shift in range(shifts):
filename = (
"result/result/"
+ model
+ str(ntrees)
+ "_"
+ str(shift + 1)
+ "_"
+ algo1_name
+ ".pickle"
)
filename2 = (
"result/result/"
+ model
+ str(ntrees)
+ "_"
+ str(shift + 1)
+ "_"
+ algo2_name
+ ".pickle"
)
multitask_df, single_task_df = unpickle(filename)
multitask_df2, single_task_df2 = unpickle(filename2)
err = [[] for _ in range(10)]
multitask_inference_times = [[] for _ in range(10)]
err2 = [[] for _ in range(10)]
multitask_inference_times2 = [[] for _ in range(10)]
for ii in range(10):
err[ii].extend(
1 - np.array(multitask_df[multitask_df["base_task"] == ii + 1]["accuracy"])
)
err2[ii].extend(
1
- np.array(multitask_df2[multitask_df2["base_task"] == ii + 1]["accuracy"])
)
multitask_inference_times[ii].extend(
np.array(
multitask_df[multitask_df["base_task"] == ii + 1][
"multitask_inference_times"
]
)
)
multitask_inference_times2[ii].extend(
np.array(
multitask_df2[multitask_df2["base_task"] == ii + 1][
"multitask_inference_times"
]
)
)
single_err = 1 - np.array(single_task_df["accuracy"])
single_err2 = 1 - np.array(single_task_df2["accuracy"])
fte, bte, te = get_fte_bte(err, single_err, ntrees)
fte2, bte2, te2 = get_fte_bte(err2, single_err2, ntrees)

err_ = [[] for i in range(task_num)]
for i in range(task_num):
for j in range(task_num - i):
err_[i].append(err[i + j][i])
err_2 = [[] for i in range(task_num)]
for i in range(task_num):
for j in range(task_num - i):
err_2[i].append(err2[i + j][i])

train_time_tmp[count].extend(np.array(single_task_df["train_times"]))
single_task_inference_time_tmp[count].extend(
np.array(single_task_df["single_task_inference_times"])
)
multitask_inference_time_tmp[count].extend(multitask_inference_times)
te_tmp[count].extend(te)
bte_tmp[count].extend(bte)
fte_tmp[count].extend(fte)
err_tmp[count].extend(err_)
train_time_tmp2[count].extend(np.array(single_task_df2["train_times"]))
single_task_inference_time_tmp2[count].extend(
np.array(single_task_df2["single_task_inference_times"])
)
multitask_inference_time_tmp2[count].extend(multitask_inference_times2)
te_tmp2[count].extend(te2)
bte_tmp2[count].extend(bte2)
fte_tmp2[count].extend(fte2)
err_tmp2[count].extend(err_2)
count += 1

te = calc_mean_te(te_tmp, reps=reps)
bte = calc_mean_bte(bte_tmp, reps=reps)
fte = calc_mean_fte(fte_tmp, reps=reps)
error = calc_mean_err(err_tmp, reps=reps)
te2 = calc_mean_te(te_tmp2, reps=reps)
bte2 = calc_mean_bte(bte_tmp2, reps=reps)
fte2 = calc_mean_fte(fte_tmp2, reps=reps)
error2 = calc_mean_err(err_tmp2, reps=reps)

train_time = np.mean(train_time_tmp, axis=0)
single_task_inference_time = np.mean(single_task_inference_time_tmp, axis=0)
multitask_inference_time = calc_mean_multitask_time(multitask_inference_time_tmp)
multitask_inference_time = [
np.mean(multitask_inference_time[i]) for i in range(len(multitask_inference_time))
]
train_time2 = np.mean(train_time_tmp2, axis=0)
single_task_inference_time2 = np.mean(single_task_inference_time_tmp2, axis=0)
multitask_inference_time2 = calc_mean_multitask_time(multitask_inference_time_tmp2)
multitask_inference_time2 = [
np.mean(multitask_inference_time2[i]) for i in range(len(multitask_inference_time2))
]

#%%
sns.set()

n_tasks = 10
clr = ["#e41a1c", "#a65628", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#CCCC00"]

fontsize = 22
ticksize = 20

fig, ax = plt.subplots(2, 2, figsize=(16, 11.5))
fig.suptitle(algo1_name + " - " + algo2_name, fontsize=25)
difference = []
zip_object = zip(fte2, fte)
for fte2_i, fte_i in zip_object:
difference.append(fte2_i - fte_i)
ax[0][0].plot(
np.arange(1, n_tasks + 1),
difference,
c="red",
marker=".",
markersize=14,
linewidth=3,
)
ax[0][0].hlines(0, 1, n_tasks, colors="grey", linestyles="dashed", linewidth=1.5)  # zero-difference reference
ax[0][0].tick_params(labelsize=ticksize)
ax[0][0].set_xlabel("Number of tasks seen", fontsize=fontsize)
ax[0][0].set_ylabel("FTE Difference", fontsize=fontsize)


for i in range(n_tasks):

et = np.asarray(bte[i])
et2 = np.asarray(bte2[i])
ns = np.arange(i + 1, n_tasks + 1)
ax[0][1].plot(ns, et2 - et, c="red", linewidth=2.6)

ax[0][1].set_xlabel("Number of tasks seen", fontsize=fontsize)
ax[0][1].set_ylabel("BTE Difference", fontsize=fontsize)
ax[0][1].tick_params(labelsize=ticksize)
ax[0][1].hlines(0, 1, n_tasks, colors="grey", linestyles="dashed", linewidth=1.5)  # zero-difference reference


for i in range(n_tasks):

et = np.asarray(te[i])
et2 = np.asarray(te2[i])
ns = np.arange(i + 1, n_tasks + 1)
ax[1][0].plot(ns, et2 - et, c="red", linewidth=2.6)

ax[1][0].set_xlabel("Number of tasks seen", fontsize=fontsize)
ax[1][0].set_ylabel("TE Difference", fontsize=fontsize)
ax[1][0].tick_params(labelsize=ticksize)
ax[1][0].hlines(0, 1, n_tasks, colors="grey", linestyles="dashed", linewidth=1.5)  # zero-difference reference


for i in range(n_tasks):
et = np.asarray(error[i][0])
et2 = np.asarray(error2[i][0])
ns = np.arange(i + 1, n_tasks + 1)

ax[1][1].plot(ns, 1 - et2 - (1 - et), c="red", linewidth=2.6)

ax[1][1].set_xlabel("Number of tasks seen", fontsize=fontsize)
ax[1][1].set_ylabel("Accuracy Difference", fontsize=fontsize)
ax[1][1].tick_params(labelsize=ticksize)

plt.savefig("result/result/", dpi=300)
plt.close()

ax = plt.subplot(111)

# Hide the right and top spines
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)

# Only show ticks on the left and bottom spines
ax.yaxis.set_ticks_position("left")
ax.xaxis.set_ticks_position("bottom")

ax.plot(
range(len(train_time)),
train_time,
linewidth=3,
linestyle="solid",
label="Train Time",
)
ax.plot(
range(len(single_task_inference_time)),
single_task_inference_time,
linewidth=3,
linestyle="solid",
label="Single Task Inference Time",
)
ax.plot(
range(len(multitask_inference_time)),
multitask_inference_time,
linewidth=3,
linestyle="solid",
label="Multi-Task Inference Time",
)


ax.set_xlabel("Number of Tasks Seen", fontsize=fontsize)
ax.set_ylabel("Time (seconds)", fontsize=fontsize)
ax.tick_params(labelsize=ticksize)
ax.legend(fontsize=22)

plt.tight_layout()

# %%
Binary file not shown.