From 28bfc4892bf23b192225c9e2ed49c120d9325476 Mon Sep 17 00:00:00 2001
From: fazelehh
Date: Mon, 14 Oct 2024 15:02:08 +0000
Subject: [PATCH] cifar example structure

---
 examples/mia/cifar/audit.yml                   |  44 ++++++
 examples/mia/cifar/cifar_handler.py            |  70 +++++++++
 .../mia/cifar/utils/cifar_data_prepration.py   | 142 ++++++++++++++++++
 .../mia/cifar/utils/cifar_model_prepration.py  | 108 +++++++++++++
 4 files changed, 364 insertions(+)

diff --git a/examples/mia/cifar/audit.yml b/examples/mia/cifar/audit.yml
index e69de29..37854ae 100644
--- a/examples/mia/cifar/audit.yml
+++ b/examples/mia/cifar/audit.yml
@@ -0,0 +1,44 @@
+audit: # Configurations for auditing
+  random_seed: 1234 # Integer specifying the random seed
+  attack_list:
+    rmia:
+      training_data_fraction: 0.5 # Fraction of the auxiliary dataset to use for this attack (in each shadow model training)
+      attack_data_fraction: 0.5 # Fraction of the auxiliary dataset to sample from during the attack
+      num_shadow_models: 3 # Number of shadow models to train
+      online: True # True for the online attack, False for the offline attack
+      temperature: 2
+      gamma: 2.0
+      offline_a: 0.33 # Parameter from which we compute p(x) from p_OUT(x) such that p_IN(x) = a * p_OUT(x) + b
+      offline_b: 0.66
+    qmia:
+      training_data_fraction: 1.0 # Fraction of the auxiliary dataset (data without train and test indices) to use for training the quantile regressor
+      epochs: 5 # Number of training epochs for quantile regression
+    population:
+      attack_data_fraction: 1.0 # Fraction of the auxiliary dataset to use for this attack
+    lira:
+      training_data_fraction: 0.5 # Fraction of the auxiliary dataset to use for this attack (in each shadow model training)
+      num_shadow_models: 8 # Number of shadow models to train
+      online: False # True for the online attack, False for the offline attack
+      fixed_variance: True # Use a fixed variance for the whole audit
+      boosting: True
+    loss_traj:
+      training_distill_data_fraction: 0.7 # Fraction of the auxiliary dataset to use for training the distillation models, D_s = (1 - D_KD) / 2
+      number_of_traj: 10 # Number of epochs (number of points in the loss trajectory)
+      label_only: False # True or False
+      mia_classifier_epochs: 100
+
+  output_dir: "./leakpro_output"
+  attack_type: "mia" # mia, gia
+  modality: "image"
+
+target:
+  # Target model path
+  module_path: "utils/cifar_model_prepration.py"
+  model_class: "ResNet18"
+  # Data paths
+  target_folder: "./target"
+  data_path: "./data/cifar10.pkl"
+
+shadow_model:
+
+distillation_model:
diff --git a/examples/mia/cifar/cifar_handler.py b/examples/mia/cifar/cifar_handler.py
index e69de29..13424f7 100644
--- a/examples/mia/cifar/cifar_handler.py
+++ b/examples/mia/cifar/cifar_handler.py
@@ -0,0 +1,70 @@
+"""Module containing the class to handle the user input for the CIFAR100 dataset."""
+
+import torch
+from torch import cuda, device, optim
+from torch.nn import CrossEntropyLoss
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from leakpro import AbstractInputHandler
+
+class Cifar100InputHandler(AbstractInputHandler):
+    """Class to handle the user input for the CIFAR100 dataset."""
+
+    def __init__(self, configs: dict) -> None:
+        super().__init__(configs=configs)
+
+
+    def get_criterion(self) -> CrossEntropyLoss:
+        """Return the CrossEntropyLoss criterion for the model."""
+        return CrossEntropyLoss()
+
+    def get_optimizer(self, model: torch.nn.Module) -> optim.Optimizer:
+        """Return the SGD optimizer for the model."""
+        learning_rate = 0.1
+        momentum = 0.8
+        return optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
+
+    def train(
+        self,
+        dataloader: DataLoader,
+        model: torch.nn.Module = None,
+        criterion: torch.nn.Module = None,
+        optimizer: optim.Optimizer = None,
+        epochs: int = None,
+    ) -> dict:
+        """Model training procedure."""
+
+        # Read hyperparameters for training (the dataloader parameters are defined in get_dataloader)
+        if epochs is None:
+            raise ValueError("epochs not found in configs")
+
+        # Prepare training
+        gpu_or_cpu = device("cuda" if cuda.is_available() else "cpu")
+        model.to(gpu_or_cpu)
+
+        # Training loop
+        for epoch in range(epochs):
+            train_loss, train_acc = 0, 0
+            model.train()
+            for inputs, labels in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
+                labels = labels.long()
+                inputs, labels = inputs.to(gpu_or_cpu, non_blocking=True), labels.to(gpu_or_cpu, non_blocking=True)
+                optimizer.zero_grad()
+                outputs = model(inputs)
+                loss = criterion(outputs, labels)
+                pred = outputs.data.max(1, keepdim=True)[1]
+                loss.backward()
+                optimizer.step()
+
+                # Accumulate training performance
+                train_acc += pred.eq(labels.data.view_as(pred)).sum()
+                train_loss += loss.item()
+
+            log_train_str = (
+                f"Epoch: {epoch+1}/{epochs} | Train Loss: {train_loss/len(dataloader):.8f} | "
+                f"Train Acc: {float(train_acc)/len(dataloader.dataset):.8f}")
+            self.logger.info(log_train_str)
+        model.to("cpu")
+
+        return {"model": model, "metrics": {"accuracy": train_acc, "loss": train_loss}}
diff --git a/examples/mia/cifar/utils/cifar_data_prepration.py b/examples/mia/cifar/utils/cifar_data_prepration.py
index e69de29..b074552 100644
--- a/examples/mia/cifar/utils/cifar_data_prepration.py
+++ b/examples/mia/cifar/utils/cifar_data_prepration.py
@@ -0,0 +1,142 @@
+import os
+import numpy as np
+import pandas as pd
+import joblib
+import pickle
+from sklearn.model_selection import train_test_split
+from torchvision import transforms, datasets
+from torchvision.datasets import CIFAR10, CIFAR100
+import urllib.request
+from torch.utils.data import Dataset, Subset, DataLoader
+from torch import tensor, float32, cat
+
+
+class CifarDataset(Dataset):
+    def __init__(self, x, y, transform=None, indices=None):
+        """
+        Custom dataset for CIFAR data.
+
+        Args:
+            x (torch.Tensor): Tensor of input images.
+            y (torch.Tensor): Tensor of labels.
+            transform (callable, optional): Optional transform to be applied on the image tensors.
+        """
+        self.x = x
+        self.y = y
+        self.transform = transform
+        self.indices = indices
+
+    def __len__(self):
+        """Return the total number of samples."""
+        return len(self.y)
+
+    def __getitem__(self, idx):
+        """Retrieve the image and its corresponding label at index 'idx'."""
+        image = self.x[idx]
+        label = self.y[idx]
+
+        # Apply transformations to the image if any
+        if self.transform:
+            image = self.transform(image)
+
+        return image, label
+
+    @classmethod
+    def from_cifar10(cls, root="./data", download=True, transform=None):
+        # Load the CIFAR10 train and test datasets
+        trainset = CIFAR10(root=root, train=True, download=download, transform=transforms.ToTensor())
+        testset = CIFAR10(root=root, train=False, download=download, transform=transforms.ToTensor())
+
+        # Concatenate both datasets' data and labels
+        data = cat([tensor(trainset.data, dtype=float32),
+                    tensor(testset.data, dtype=float32)],
+                   dim=0)
+        # Rescale data from [0, 255] to [0, 1]
+        data /= 255.0
+        normalize = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+        data = data.permute(0, 3, 1, 2)
+        data = normalize(data)
+
+        targets = cat([tensor(trainset.targets), tensor(testset.targets)], dim=0)
+
+        return cls(data, targets)
+
+    @classmethod
+    def from_cifar100(cls, root="./data", download=True, transform=None):
+        # Load the CIFAR100 train and test datasets
+        trainset = CIFAR100(root=root, train=True, download=download, transform=transforms.ToTensor())
+        testset = CIFAR100(root=root, train=False, download=download, transform=transforms.ToTensor())
+
+        # Concatenate both datasets' data and labels
+        data = cat([tensor(trainset.data, dtype=float32),
+                    tensor(testset.data, dtype=float32)],
+                   dim=0)
+        # Rescale data from [0, 255] to [0, 1]
+        data /= 255.0
+        normalize = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+        data = data.permute(0, 3, 1, 2)
+        data = normalize(data)
+
+        targets = cat([tensor(trainset.targets), tensor(testset.targets)], dim=0)
+
+        return cls(data, targets)
+
+    def subset(self, indices):
+        """Return a subset of the dataset based on the given indices."""
+        return CifarDataset(self.x[indices], self.y[indices], transform=self.transform, indices=indices)
+
+
+def get_cifar10_dataset(data_path):
+    # Create the combined CIFAR-10 dataset
+
+    transform = transforms.Compose(
+        [transforms.ToTensor(),
+         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+    population = CifarDataset.from_cifar10(root=data_path, download=True, transform=transform)
+
+    file_path = os.path.join(data_path, "cifar10.pkl")
+    if not os.path.exists(file_path):
+        with open(file_path, "wb") as file:
+            pickle.dump(population, file)
+            print(f"Saved population data to {file_path}")
+
+    # Split the population into pretrain, test, and train index subsets
+    pretrain_indices = list(range(50000))  # indices 0-49999 form the pretrain set
+    test_indices = list(range(50001, 51000))  # indices 50001-50999 form the test set
+    client_indices = list(range(51001, 51002))  # index 51001 forms the training set
+    trainset = population.subset(client_indices)
+    testset = population.subset(test_indices)
+    pretrainset = population.subset(pretrain_indices)
+
+    return trainset, testset, pretrainset
+
+
+def get_cifar100_dataset(data_path):
+    # Create the combined CIFAR-100 dataset
+
+    transform = transforms.Compose(
+        [transforms.ToTensor(),
+         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+    population = CifarDataset.from_cifar100(root=data_path, download=True, transform=transform)
+
+    file_path = os.path.join(data_path, "cifar100.pkl")
+    if not os.path.exists(file_path):
+        with open(file_path, "wb") as file:
+            pickle.dump(population, file)
+            print(f"Saved population data to {file_path}")
+
+    # Split the population into pretrain, test, and train index subsets
+    pretrain_indices = list(range(50000))  # indices 0-49999 form the pretrain set
+    test_indices = list(range(50001, 51000))  # indices 50001-50999 form the test set
+    client_indices = list(range(51001, 51002))  # index 51001 forms the training set
+    trainset = population.subset(client_indices)
+    testset = population.subset(test_indices)
+    pretrainset = population.subset(pretrain_indices)
+
+    return trainset, testset, pretrainset
+
+
diff --git a/examples/mia/cifar/utils/cifar_model_prepration.py b/examples/mia/cifar/utils/cifar_model_prepration.py
index e69de29..77b17ff 100644
--- a/examples/mia/cifar/utils/cifar_model_prepration.py
+++ b/examples/mia/cifar/utils/cifar_model_prepration.py
@@ -0,0 +1,108 @@
+import torch.nn as nn
+from torch import device, optim, cuda, no_grad, save
+import torchvision.models as models
+import pickle
+from tqdm import tqdm
+
+class ResNet18(nn.Module):
+    def __init__(self, num_classes):
+        super(ResNet18, self).__init__()
+        self.model = models.resnet18(weights=None)
+        self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)
+        self.init_params = {"num_classes": num_classes}
+
+    def forward(self, x):
+        return self.model(x)
+
+def evaluate(model, loader, criterion, device):
+    model.eval()
+    loss, acc = 0, 0
+    with no_grad():
+        for data, target in loader:
+            data, target = data.to(device), target.to(device)
+            output = model(data)
+            loss += criterion(output, target).item()
+            pred = output.argmax(dim=1)
+            acc += pred.eq(target).sum().item()
+    loss /= len(loader)
+    acc = float(acc) / len(loader.dataset)
+    return loss, acc
+
+def create_trained_model_and_metadata(model, train_loader, test_loader, epochs=10, metadata=None):
+    device_name = device("cuda" if cuda.is_available() else "cpu")
+    model.to(device_name)
+    model.train()
+
+    criterion = nn.CrossEntropyLoss()
+    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.8)
+    train_losses, train_accuracies = [], []
+    test_losses, test_accuracies = [], []
+
+    for e in tqdm(range(epochs), desc="Training Progress"):
+        model.train()
+        train_acc, train_loss = 0.0, 0.0
+
+        for data, target in train_loader:
+            data, target = data.to(device_name, non_blocking=True), target.to(device_name, non_blocking=True)
+            optimizer.zero_grad()
+            output = model(data)
+
+            loss = criterion(output, target)
+            pred = output.argmax(dim=1)  # for multi-class classification
+            train_acc += pred.eq(target).sum().item()
+
+            loss.backward()
+            optimizer.step()
+            train_loss += loss.item()
+
+        train_loss /= len(train_loader)
+        train_acc /= len(train_loader.dataset)
+
+        train_losses.append(train_loss)
+        train_accuracies.append(train_acc)
+
+        test_loss, test_acc = evaluate(model, test_loader, criterion, device_name)
+        test_losses.append(test_loss)
+        test_accuracies.append(test_acc)
+
+    # Move the model back to the CPU and store its weights
+    model.to("cpu")
+    with open("target/target_model.pkl", "wb") as f:
+        save(model.state_dict(), f)
+
+    # Create metadata and store it
+    meta_data = {}
+    meta_data["train_indices"] = train_loader.dataset.indices
+    meta_data["test_indices"] = test_loader.dataset.indices
+    meta_data["num_train"] = len(meta_data["train_indices"])
+
+    # Write init params
+    meta_data["init_params"] = {}
+    for key, value in model.init_params.items():
+        meta_data["init_params"][key] = value
+
+    # Read out optimizer parameters
+    meta_data["optimizer"] = {}
+    meta_data["optimizer"]["name"] = optimizer.__class__.__name__.lower()
+    meta_data["optimizer"]["lr"] = optimizer.param_groups[0].get("lr", 0)
+    meta_data["optimizer"]["weight_decay"] = optimizer.param_groups[0].get("weight_decay", 0)
+    meta_data["optimizer"]["momentum"] = optimizer.param_groups[0].get("momentum", 0)
+    meta_data["optimizer"]["dampening"] = optimizer.param_groups[0].get("dampening", 0)
+    meta_data["optimizer"]["nesterov"] = optimizer.param_groups[0].get("nesterov", False)
+
+    # Read out criterion parameters
+    meta_data["loss"] = {}
+    meta_data["loss"]["name"] = criterion.__class__.__name__.lower()
+
+    meta_data["batch_size"] = train_loader.batch_size
+    meta_data["epochs"] = epochs
+    meta_data["train_acc"] = train_acc
+    meta_data["test_acc"] = test_acc
+    meta_data["train_loss"] = train_loss
+    meta_data["test_loss"] = test_loss
+    meta_data["dataset"] = "cifar10"
+
+    with open("target/model_metadata.pkl", "wb") as f:
+        pickle.dump(meta_data, f)
+
+    return train_accuracies, train_losses, test_accuracies, test_losses
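
Note (not part of the patch): a minimal driver sketch of how the pieces above are expected to fit together for CIFAR-10, i.e. how to produce the ./data and ./target artifacts that audit.yml points to. It assumes it is run from examples/mia/cifar; the script name, batch size, and epoch count are illustrative assumptions, not values taken from this commit.

# hypothetical driver script, e.g. examples/mia/cifar/run_example.py (not in this patch)
import os

from torch.utils.data import DataLoader

from utils.cifar_data_prepration import get_cifar10_dataset
from utils.cifar_model_prepration import ResNet18, create_trained_model_and_metadata

data_path = "./data/"
os.makedirs(data_path, exist_ok=True)   # CIFAR-10 is downloaded and pickled here
os.makedirs("./target", exist_ok=True)  # create_trained_model_and_metadata writes target_model.pkl and model_metadata.pkl here

# Build the population splits defined in cifar_data_prepration.py
trainset, testset, pretrainset = get_cifar10_dataset(data_path)
train_loader = DataLoader(trainset, batch_size=64, shuffle=True)   # batch size is an assumption
test_loader = DataLoader(testset, batch_size=64, shuffle=False)

# Train the target model and write the metadata consumed by the audit
model = ResNet18(num_classes=10)  # CIFAR-10 has 10 classes
train_accs, train_losses, test_accs, test_losses = create_trained_model_and_metadata(
    model, train_loader, test_loader, epochs=10)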