# pami_contrastivelearning_pavlinec_zarbock_hammer_zeilmann.py

# -*- coding: utf-8 -*-
"""PAMI_ContrastiveLearning_Pavlinec_Zarbock_Hammer_Zeilmann.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/18yehqihoIuzPElyj5CcMf1YPj2-O1hKq
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as T
from torchsummary import summary
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Use the GPU when PyTorch can see one; otherwise everything runs on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Hyperparameter: mini-batch size shared by all DataLoaders created below.
batch_size = 1024

# Note on the simple transformations defined further down: in early
# experiments the loss went down too quickly without the network learning
# additional features, so they are not covered in the final experimental setup.

# Baseline pipeline handed to DownloadDataset: convert to a tensor, then
# normalize with mean 0.5 and standard deviation 0.5.
transform_TensorOnly = T.Compose([
    T.ToTensor(),
    T.Normalize(0.5, 0.5),
])

# Placeholders only; InitializeTransforms() rebinds each name to a real pipeline.
transform_simple, transform_affine, transform_FER, transform_SimCLR = [], [], [], []

def InitializeTransforms(sizevariable, colorjitter):
    """Rebuild the module-level augmentation pipelines.

    Rebinds the globals ``transform_simple``, ``transform_affine``,
    ``transform_TensorOnly``, ``transform_FER`` and ``transform_SimCLR``.

    Args:
        sizevariable: Edge length used by the resize / crop steps.
        colorjitter: Multiplier applied to the ColorJitter arguments of the
            SimCLR pipeline.

    Returns:
        None. The results are published through the globals listed above.
    """
    global transform_simple, transform_affine, transform_TensorOnly
    global transform_FER, transform_SimCLR

    def tensor_tail():
        # Every pipeline ends the same way: tensor conversion + normalization.
        return [T.ToTensor(), T.Normalize(0.5, 0.5)]

    jitter_strength = colorjitter
    jitter = T.ColorJitter(
        0.8 * jitter_strength,
        0.8 * jitter_strength,
        0.8 * jitter_strength,
        0.2 * jitter_strength,
    )

    # Exploratory pipelines (see the note at the top of the file).
    transform_simple = T.Compose(
        [T.RandomHorizontalFlip(p=1), T.RandomGrayscale(p=0.2)] + tensor_tail()
    )
    transform_affine = T.Compose(
        [T.RandomHorizontalFlip(p=1), T.RandomAffine((-30, +30)), T.Resize(64)]
        + tensor_tail()
    )

    # --- The following three transformations are used in the report ---

    # Normalization only.
    transform_TensorOnly = T.Compose(tensor_tail())

    # Transformation from paper Guo, FER, 16: upscale, pad, rotate, crop back
    # and resize to the requested edge length.
    upscaled = sizevariable * 2
    transform_FER = T.Compose(
        [
            T.Resize(upscaled),
            T.Pad(18),
            T.RandomRotation((-15, +15)),
            # NOTE: this product is a float (e.g. 63.0 for sizevariable=28).
            T.CenterCrop(upscaled * 1.125),
            T.Resize(sizevariable),
        ]
        + tensor_tail()
    )

    # Transformation from paper Chen et al., SimCLR, 20.
    # The blur kernel is derived from the constant 32 (int(0.1 * 32) == 3),
    # not from sizevariable.
    transform_SimCLR = T.Compose(
        [
            T.RandomResizedCrop(size=sizevariable),
            T.RandomHorizontalFlip(),
            T.RandomApply([jitter], p=0.8),
            T.RandomGrayscale(p=0.2),
            T.GaussianBlur(kernel_size=int(0.1 * 32)),
        ]
        + tensor_tail()
    )

def DownloadDataset(dataset,transformation):
    """Download MNIST or CIFAR10 and publish the splits as module globals.

    Args:
        dataset: ``"MNIST"`` or ``"CIFAR10"``. Any other value only prints a
            message and leaves the globals untouched.
        transformation: Transform applied to the augmented copy of the
            training split (``train_dataset_transformed``).

    Side effects:
        Rebinds the globals ``classes``, ``train_dataset``,
        ``train_dataset_transformed`` and ``test_dataset``. The untransformed
        training split and the test split use the global
        ``transform_TensorOnly``.
    """
    global classes, train_dataset, train_dataset_transformed, test_dataset

    if dataset == "MNIST":
        class_names = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    elif dataset == "CIFAR10":
        class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
    else:
        print("Download failed, please use MNIST or CIFAR10 as datasets.")
        return

    # Both supported names are also the torchvision dataset class names.
    dataset_cls = getattr(torchvision.datasets, dataset)

    classes = class_names
    train_dataset = dataset_cls(root='./data', train=True, download=True, transform=transform_TensorOnly)
    train_dataset_transformed = dataset_cls(root='./data', train=True, download=True, transform=transformation)
    test_dataset = dataset_cls(root='./data', train=False, download=True, transform=transform_TensorOnly)

"""Choose one of the following six download functions. These functions will overwrite each other if you run more than one."""

# Each group below selects one dataset / augmentation combination. Every call
# rebinds the same globals, so when all groups run in sequence the last one
# (CIFAR10 with SimCLR augmentation) is the configuration that stays active.

# Option 1: MNIST, normalization only
sizevariable, colorchannels = 28, 1
InitializeTransforms(sizevariable, 1)
DownloadDataset("MNIST", transform_TensorOnly)

# Option 2: MNIST, FER image augmentation
sizevariable, colorchannels = 28, 1
InitializeTransforms(sizevariable, 1)
DownloadDataset("MNIST", transform_FER)

# Option 3: MNIST, SimCLR image augmentation
sizevariable, colorchannels = 28, 1
InitializeTransforms(sizevariable, 1)
DownloadDataset("MNIST", transform_SimCLR)

# Option 4: CIFAR10, normalization only
sizevariable, colorchannels = 32, 3
InitializeTransforms(sizevariable, 1)
DownloadDataset("CIFAR10", transform_TensorOnly)

# Option 5: CIFAR10, FER image augmentation
sizevariable, colorchannels = 32, 3
InitializeTransforms(sizevariable, 1)
DownloadDataset("CIFAR10", transform_FER)

# Option 6: CIFAR10, SimCLR image augmentation
sizevariable, colorchannels = 32, 3
InitializeTransforms(sizevariable, 1)
DownloadDataset("CIFAR10", transform_SimCLR)

# Build positive pairs, negative pairs and triplets for contrastive training.
#
# Samples are bucketed in dictionaries keyed by class name instead of in
# exec()-generated variables; the resulting train_dataset_positive /
# train_dataset_negative / train_dataset_triplets lists keep the same layout.

# Number of pairs / triplets built per class. If a class offers fewer
# originals, positives or negatives, that class is truncated to the shortest
# of the three lists.
samples_per_class = 5000

# Per-class buckets of (image, label) samples
originals_by_class = {name: [] for name in classes}    # untransformed images of the class
negatives_by_class = {name: [] for name in classes}    # images of every *other* class
transformed_by_class = {name: [] for name in classes}  # augmented images of the class

train_dataset_positive = []
train_dataset_negative = []
train_dataset_triplets = []

# Fill original and negative example lists per class
for sample in train_dataset:
    own_class = classes[sample[1]]
    originals_by_class[own_class].append(sample)
    for other_class in classes:
        if other_class != own_class:
            negatives_by_class[other_class].append(sample)

# Fill transformed (positive example) lists per class. Both datasets are
# iterated in the same index order, so position j in originals_by_class[c]
# and transformed_by_class[c] refers to the same underlying image.
for sample in train_dataset_transformed:
    transformed_by_class[classes[sample[1]]].append(sample)

for class_name in classes:
    # Shuffle the negatives to get a mix of classes in the negative pairs / triplets
    random.shuffle(negatives_by_class[class_name])

    anchors = originals_by_class[class_name][:samples_per_class]
    positives = transformed_by_class[class_name][:samples_per_class]
    negatives = negatives_by_class[class_name][:samples_per_class]

    # Target 1 marks a positive pair and -1 a negative pair, as used by the
    # CosineEmbeddingLoss function.
    for anchor, positive, negative in zip(anchors, positives, negatives):
        train_dataset_positive.append((anchor[0], positive[0], torch.tensor([1])))
        train_dataset_negative.append((anchor[0], negative[0], torch.tensor([-1])))
        train_dataset_triplets.append((anchor[0], positive[0], negative[0]))

# Positive pairs and negative pairs are combined as a last step
train_dataset_pairwise = train_dataset_positive + train_dataset_negative

# Initiate the data loaders
train_loader_pairwise = torch.utils.data.DataLoader(train_dataset_pairwise, batch_size=batch_size, shuffle=True)
train_loader_triplets = torch.utils.data.DataLoader(train_dataset_triplets, batch_size=batch_size, shuffle=True)

# test_loader_total will be used later for evaluation
test_loader_total = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

"""You can use the following code to preview the first 16 images in one batch."""

def imshow(img):
    """Display an image tensor that was normalized with mean 0.5 / std 0.5.

    Args:
        img: Tensor to display. Assumed to be laid out as (C, H, W), as
            returned by ``torchvision.utils.make_grid`` — TODO confirm.
    """
    # Undo the normalization: x / 2 + 0.5 inverts (x - 0.5) / 0.5.
    restored = img / 2 + 0.5
    # Move the first axis to the end before handing the array to matplotlib.
    channels_last = np.transpose(restored.numpy(), (1, 2, 0))
    plt.imshow(channels_last)
    plt.show()

# Show the first 16 anchors, positives and negatives of one triplet batch.
# The builtin next() is used because DataLoader iterators in current PyTorch
# releases no longer provide a .next() method.
dataiter_og = iter(train_loader_triplets)
images_og, transformed_og, negative_og = next(dataiter_og)
imshow(torchvision.utils.make_grid(images_og[0:16]))
imshow(torchvision.utils.make_grid(transformed_og[0:16]))
imshow(torchvision.utils.make_grid(negative_og[0:16]))

#FINAL for CIFAR10 and MNIST

class CNN(nn.Module):
    """Convolutional encoder that maps an image to a flat embedding.

    Four stride-2 convolutions (2x2 kernels, padding 1) are followed by a
    final 2x2 convolution with 8 output channels, a sigmoid and a flatten.
    For 28x28 and 32x32 inputs this gives 8 x 2 x 2 = 32 values per image,
    each in the range [0, 1].

    Args:
        in_channels: Number of input image channels. When omitted, the
            module-level ``colorchannels`` value is used, which keeps the
            original ``CNN()`` call working.
    """

    def __init__(self, in_channels=None):
        super().__init__()
        if in_channels is None:
            # Backward-compatible default: follow the globally selected dataset.
            in_channels = colorchannels
        self.disc = nn.Sequential(
            nn.Conv2d(in_channels, 32, 2, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(32, 64, 2, 2, 1, bias=False),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(64, 128, 2, 2, 1, bias=False),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(128, 256, 2, 2, 1, bias=False),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(256, 8, 2, 1, 0, bias=False),
            nn.Sigmoid(),
            nn.Flatten()
        )

    def forward(self, input):
        """Return the flattened embedding for a batch of images."""
        return self.disc(input)

# One independent encoder per training objective. The pairwise model is
# created first so the order of weight initialization stays the same.
model_pairwise = CNN().to(device)
model_triplets = CNN().to(device)

# Pairwise objective: cosine embedding loss optimized with plain SGD
learning_rate_pairwise = 0.05
criterion_pairwise = nn.CosineEmbeddingLoss()
optimizer_pairwise = torch.optim.SGD(
    model_pairwise.parameters(),
    lr=learning_rate_pairwise,
)

# Triplet objective: triplet margin loss optimized with plain SGD
learning_rate_triplets = 0.15
criterion_triplets = nn.TripletMarginLoss()
optimizer_triplets = torch.optim.SGD(
    model_triplets.parameters(),
    lr=learning_rate_triplets,
)

# Print a layer-by-layer overview for the currently selected input shape
summary(model_triplets, (colorchannels, sizevariable, sizevariable))

# Helper functions to evaluate on the test
#Please ignore details of code here; the individual steps reappear in the code later; and are documented there

#Returns top-n-accuracy; adapted from: https://towardsdatascience.com/understanding-top-n-accuracy-metrics-8aa90170b35
def top_n_accuracy(X,y,n,classifier):
    """Return the top-n accuracy of ``classifier`` on a random 80/20 split.

    Args:
        X: Feature vectors, one per sample.
        y: Labels aligned with ``X`` (one label per sample).
        n: Number of highest-probability classes that count as a hit.
        classifier: Estimator providing ``fit``, ``predict_proba`` and
            ``classes_``. It is fitted in place on the training part.

    Returns:
        Fraction of held-out samples whose true label is among the ``n``
        classes with the highest predicted probability.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    classifier.fit(X_train, y_train)
    probs = classifier.predict_proba(X_test)

    # argsort yields *column indices* of predict_proba; those columns follow
    # classifier.classes_, so translate them back to labels before comparing.
    top_columns = np.argsort(probs, axis=1)[:, -n:]
    top_labels = np.asarray(classifier.classes_)[top_columns]

    y_true = np.asarray(y_test)
    hits = np.any(top_labels == y_true[:, None], axis=1)
    return np.mean(hits)

# Function to test the current network representation with test data by training a logistic regression
# Prints accuracy and top-3-accuracy
def evaluate(model):
    """Probe the current representation with a logistic regression.

    The model is applied to every batch of ``test_loader_total``; the
    resulting embeddings are split 80/20, a LogisticRegression is fitted on
    the larger part, and its accuracy on the smaller part is printed. The
    top-3 accuracy is printed afterwards; ``top_n_accuracy`` draws its own
    independent split for that.

    Args:
        model: Network whose outputs are used as features.
    """
    targets = []
    features = []

    # Extract features in evaluation mode so BatchNorm uses its running
    # statistics and does not update them with test data. The previous mode
    # is restored afterwards because this runs in the middle of training.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for images, labels in test_loader_total:
                targets.extend(labels.cpu().numpy())
                outputs = model(images.to(device))
                features.extend(outputs.cpu().numpy())
    finally:
        model.train(was_training)

    training_data, test_data, train_label, test_label = train_test_split(features, targets, train_size=0.8)
    logreg = LogisticRegression(max_iter=1000)
    logreg.fit(X=training_data, y=train_label)
    pred_label = logreg.predict(test_data)
    print('Accuracy Logistic Regression')
    print(accuracy_score(test_label, pred_label))
    top3accuracy = top_n_accuracy(features, targets, 3, LogisticRegression(max_iter=1000))
    print('Top 3 accuracy')
    print(top3accuracy)

# Number of epochs for each of the two training runs
num_epochs_pairwise = 10
num_epochs_triplets = 10

# Number of batches per epoch; only used in the progress messages
n_total_steps_pairwise = len(train_loader_pairwise)
n_total_steps_triplets = len(train_loader_triplets)

# Pairwise training: model_pairwise is optimized with criterion_pairwise on
# (original, second image, target) batches from train_loader_pairwise.

for epoch in range(num_epochs_pairwise):
    # Report test-set probe accuracy before every epoch except the first
    if (epoch > 0):
        evaluate(model_pairwise)
    for i, (original, transformed, index) in enumerate(train_loader_pairwise):

        original = original.to(device)
        transformed = transformed.to(device)
        # Targets are stored as one-element tensors per sample; flatten them
        # into a single vector per batch before passing them to the loss
        index = index.flatten().to(device)

        # Forward pass: both images go through the same network
        outputO = model_pairwise(original)
        outputT = model_pairwise(transformed)

        # Calculate loss
        loss = criterion_pairwise(outputO, outputT, index)

        # Backward and optimize
        optimizer_pairwise.zero_grad()
        loss.backward()
        optimizer_pairwise.step()

        # Logging: print the current batch loss every 30 steps
        if (i+1) % 30 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs_pairwise}], Step [{i+1}/{n_total_steps_pairwise}], Loss: {loss.item():.4f}')

print('Finished Training')

# Triplet training: model_triplets is optimized with criterion_triplets on
# (anchor, positive, negative) batches from train_loader_triplets.

for epoch in range(num_epochs_triplets):
    # Report test-set probe accuracy before every fifth epoch (not the first)
    if (epoch % 5 == 0) and (epoch > 0):
        evaluate(model_triplets)
    for i, (anchor, positive, negative) in enumerate(train_loader_triplets):

        anchor = anchor.to(device)
        positive = positive.to(device)
        negative = negative.to(device)

        # Forward pass: all three images go through the same network
        outputA = model_triplets(anchor)
        outputP = model_triplets(positive)
        outputN = model_triplets(negative)

        # Calculate loss
        loss = criterion_triplets(outputA, outputP, outputN)

        # Backward and optimize
        optimizer_triplets.zero_grad()
        loss.backward()
        optimizer_triplets.step()

        # Logging: print the current batch loss every 10 steps
        if (i+1) % 10 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs_triplets}], Step [{i+1}/{n_total_steps_triplets}], Loss: {loss.item():.4f}')

print('Finished Training')

#For evaluating the model, the model is applied to the test data

def ModelTesting(model):
    """Apply ``model`` to the test data and store the results in globals.

    Args:
        model: Network whose outputs form the contrastive representation.

    Side effects:
        Rebinds the globals ``targets`` (list of class labels) and
        ``features`` (list of model outputs, one array per test sample),
        filled in the order in which ``test_loader_total`` yields batches.
    """
    global targets, features
    targets = []   # Targets will contain the class labels
    features = []  # Features will contain the model outputs -> contrastive representation

    # Run in evaluation mode so BatchNorm uses its running statistics and
    # does not update them with test data; restore the previous mode after.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for images, labels in test_loader_total:
                # One label per sample of the batch
                targets.extend(labels.cpu().numpy())
                # One output row per sample of the batch
                outputs = model(images.to(device))
                features.extend(outputs.cpu().numpy())
    finally:
        model.train(was_training)

"""Choose one of the following validation options that you want to display via TSNE."""

# Print probe accuracy and top-3 accuracy for the triplet-trained model
evaluate(model_triplets)

# Run this for testing the pairwise-trained model
ModelTesting(model_pairwise)

# Run this for testing the triplet-trained model.
# NOTE: ModelTesting rebinds the globals `features` and `targets`, so when
# both calls are executed the results of this call are the ones used below.
ModelTesting(model_triplets)

# Reduce number of features and targets to 1000 so that the visualization is not too cluttered
features = np.array(features)
features_cut = features[:1000]
targets = np.array(targets)
targets_cut = targets[:1000]

# After experimenting with different perplexities, we have decided to use 20 as 'main settings'
# NOTE(review): newer scikit-learn releases appear to rename TSNE's `n_iter`
# to `max_iter` — confirm against the installed version.
tsne = TSNE(n_components=2, verbose=1, perplexity=20, n_iter=5000)
tsne_results = tsne.fit_transform(features_cut)

# Plot those points as a scatter plot and label them based on the labels (adapted from: https://towardsdatascience.com/visualizing-feature-vectors-embeddings-using-pca-and-t-sne-ef157cea3a42)
# plt.get_cmap is used instead of matplotlib.cm.get_cmap, which has been
# removed from newer Matplotlib releases.
cmap = plt.get_cmap('tab10')
fig, ax = plt.subplots(figsize=(8,8))
num_categories = 10
for lab in range(num_categories):
    # Boolean mask selecting the embedded points of class `lab`
    indices = targets_cut == lab
    ax.scatter(
        tsne_results[indices, 0],
        tsne_results[indices, 1],
        c=np.array(cmap(lab)).reshape(1, 4),
        label=classes[lab],
        alpha=0.5,
    )
ax.legend(fontsize='large', markerscale=2)
plt.show()

# We look at 6 different perplexity settings, to get a better idea of the representation.
# -> Perplexity assumes the number of close neighbors each point has

plt.figure(figsize = (10,5))
plt.subplots_adjust(top = 1.5)
# plt.get_cmap is used instead of matplotlib.cm.get_cmap, which has been
# removed from newer Matplotlib releases.
cmap = plt.get_cmap('tab10')
for index, p in enumerate([3, 10, 15, 25, 35, 50]):
    # NOTE(review): newer scikit-learn releases appear to rename TSNE's
    # `n_iter` to `max_iter` — confirm against the installed version.
    tsne = TSNE(n_components = 2, perplexity = p, random_state=0, learning_rate=100, n_iter=5000)
    tsne_results = tsne.fit_transform(features_cut)
    num_categories = 10

    # Select the panel and set its title once per perplexity value rather
    # than once per class.
    plt.subplot(2, 3, index + 1)
    plt.title('Perplexity = '+ str(p))
    for lab in range(num_categories):
        indices = targets_cut == lab
        plt.scatter(
            tsne_results[indices, 0],
            tsne_results[indices, 1],
            c=np.array(cmap(lab)).reshape(1, 4),
            label=lab,
            alpha=0.5,
        )

# t-SNE can be learned for three dimensions as well
# The results can be plotted on a 3-dimensional scatterplot

# Plot in 3D
tsne = TSNE(3, verbose=1, perplexity=30, learning_rate=200)
tsne_proj = tsne.fit_transform(features_cut)
# plt.get_cmap is used instead of matplotlib.cm.get_cmap, which has been
# removed from newer Matplotlib releases.
cmap = plt.get_cmap('tab10')
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')
num_categories = 10
for lab in range(num_categories):
    # Boolean mask selecting the embedded points of class `lab`
    indices = targets_cut == lab
    ax.scatter(
        tsne_proj[indices, 0],
        tsne_proj[indices, 1],
        tsne_proj[indices, 2],
        c=np.array(cmap(lab)).reshape(1, 4),
        label=classes[lab],
        alpha=0.5,
    )
ax.legend(fontsize='large', markerscale=2)
plt.show()

# Attempt classification

# Logistic Regression with scikit-learn

# Features are the outputs of the trained model, applied to the test data (n = 10000)

# We further split the test data into train (80%, n = 8000) and test (20%, n = 2000)
# (-> train and test in regard to the classification)

training_data, test_data, train_label, test_label = train_test_split(features, targets, train_size=0.8)
#training_data, test_data, train_label, test_label = train_test_split(tsne_results, targets_test, train_size=0.8) #-> Train with T-SNE
print("Training set size:", len(training_data))
print("Test set size:", len(test_data))

# Fit the classifier once on the training part
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X=training_data, y=train_label)

# Calculate the accuracy on the held-out part
pred_label = logreg.predict(test_data)

print('Accuracy Logistic Regression')
print(accuracy_score(test_label, pred_label))

# Confusion matrix. Stored under its own name so that the imported
# `matplotlib.cm` module is not shadowed.
conf_matrix = confusion_matrix(test_label, pred_label)

## Plot of confusion matrix (https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html)
fig, ax = plt.subplots(figsize=(10,10))
np.set_printoptions(precision=3)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=classes)
# Plot non-normalized confusion matrix
disp.plot(cmap = plt.cm.Blues, ax=ax)
plt.show()

#For getting a better idea of the prediction accuracy, we can look at the top-n-accuracy

#Top N accuracy (https://towardsdatascience.com/understanding-top-n-accuracy-metrics-8aa90170b35)

def top_n_accuracy(X,y,n,classifier):
    """Return the top-n accuracy of ``classifier`` on a random 80/20 split.

    Args:
        X: Feature vectors, one per sample.
        y: Labels aligned with ``X`` (one label per sample).
        n: Number of highest-probability classes that count as a hit.
        classifier: Estimator providing ``fit``, ``predict_proba`` and
            ``classes_``. It is fitted in place on the training part.

    Returns:
        Fraction of held-out samples whose true label is among the ``n``
        classes with the highest predicted probability.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    classifier.fit(X_train, y_train)
    probs = classifier.predict_proba(X_test)

    # argsort yields *column indices* of predict_proba; those columns follow
    # classifier.classes_, so translate them back to labels before comparing.
    top_columns = np.argsort(probs, axis=1)[:, -n:]
    top_labels = np.asarray(classifier.classes_)[top_columns]

    y_true = np.asarray(y_test)
    hits = np.any(top_labels == y_true[:, None], axis=1)
    return np.mean(hits)

# Calculate top n accuracy -> Percentage of labels that lie in the top n predictions with highest probability
n = 3
acc = top_n_accuracy(features, targets, n, LogisticRegression(max_iter=1000))
print("Top", n, "accuracy:", acc)

# To check whether the learning may have found nonlinear contrastive
# representations, we can also try knn, with varying numbers of neighbors.

# -> We have found that using knn with n_neighbors between 20 and 100 usually
#    slightly outperforms Logistic Regression, although not dramatically (1-3%)

knn = KNeighborsClassifier(n_neighbors=30)
knn.fit(X=training_data, y=train_label)

# Calculate accuracy. The score is printed explicitly: as a bare expression
# it is only displayed inside a notebook cell and is discarded in a script.
pred_label = knn.predict(test_data)
print('Accuracy KNN')
print(accuracy_score(test_label, pred_label))