From 28bfc4892bf23b192225c9e2ed49c120d9325476 Mon Sep 17 00:00:00 2001
From: fazelehh
Date: Mon, 14 Oct 2024 15:02:08 +0000
Subject: [PATCH] cifar example structure

---
 examples/mia/cifar/audit.yml                   |  44 ++++++
 examples/mia/cifar/cifar_handler.py            |  70 +++++++++
 .../mia/cifar/utils/cifar_data_prepration.py   | 142 ++++++++++++++++++
 .../mia/cifar/utils/cifar_model_prepration.py  | 108 +++++++++++++
 4 files changed, 364 insertions(+)

diff --git a/examples/mia/cifar/audit.yml b/examples/mia/cifar/audit.yml
index e69de29..37854ae 100644
--- a/examples/mia/cifar/audit.yml
+++ b/examples/mia/cifar/audit.yml
@@ -0,0 +1,44 @@
+audit: # Configurations for auditing
+  random_seed: 1234 # Integer specifying the random seed
+  attack_list:
+    rmia:
+      training_data_fraction: 0.5 # Fraction of the auxiliary dataset to use for this attack (in each shadow model training)
+      attack_data_fraction: 0.5 # Fraction of the auxiliary dataset to sample from during the attack
+      num_shadow_models: 3 # Number of shadow models to train
+      online: True # True for the online attack, False for the offline attack
+      temperature: 2
+      gamma: 2.0
+      offline_a: 0.33 # Parameter from which we compute p(x) from p_OUT(x) such that p_IN(x) = a * p_OUT(x) + b
+      offline_b: 0.66
+    qmia:
+      training_data_fraction: 1.0 # Fraction of the auxiliary dataset (data without train and test indices) to use for training the quantile regressor
+      epochs: 5 # Number of training epochs for quantile regression
+    population:
+      attack_data_fraction: 1.0 # Fraction of the auxiliary dataset to use for this attack
+    lira:
+      training_data_fraction: 0.5 # Fraction of the auxiliary dataset to use for this attack (in each shadow model training)
+      num_shadow_models: 8 # Number of shadow models to train
+      online: False # True for the online attack, False for the offline attack
+      fixed_variance: True # Use a fixed variance for the whole audit
+      boosting: True
+    loss_traj:
+      training_distill_data_fraction: 0.7 # Fraction of the auxiliary dataset to use for training the distillation models, D_s = (1 - D_KD) / 2
+      number_of_traj: 10 # Number of epochs (number of points in the loss trajectory)
+      label_only: False # True or False
+      mia_classifier_epochs: 100
+
+  output_dir: "./leakpro_output"
+  attack_type: "mia" # mia, gia
+  modality: "image"
+
+target:
+  # Target model path
+  module_path: "utils/cifar_model_prepration.py"
+  model_class: "ResNet18"
+  # Data paths
+  target_folder: "./target"
+  data_path: "./data/cifar10.pkl"
+
+shadow_model:
+
+distillation_model:
diff --git a/examples/mia/cifar/cifar_handler.py b/examples/mia/cifar/cifar_handler.py
index e69de29..13424f7 100644
--- a/examples/mia/cifar/cifar_handler.py
+++ b/examples/mia/cifar/cifar_handler.py
@@ -0,0 +1,70 @@
+"""Module containing the class to handle the user input for the CIFAR100 dataset."""
+
+import torch
+from torch import cuda, device, optim
+from torch.nn import CrossEntropyLoss
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from leakpro import AbstractInputHandler
+
+class Cifar100InputHandler(AbstractInputHandler):
+    """Class to handle the user input for the CIFAR100 dataset."""
+
+    def __init__(self, configs: dict) -> None:
+        super().__init__(configs=configs)
+
+
+    def get_criterion(self) -> CrossEntropyLoss:
+        """Return the CrossEntropyLoss criterion for the model."""
+        return CrossEntropyLoss()
+
+    def get_optimizer(self, model: torch.nn.Module) -> optim.Optimizer:
+        """Return the SGD optimizer for the model."""
+        learning_rate = 0.1
+        momentum = 0.8
+        return optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
+
+    def train(
+        self,
+        dataloader: DataLoader,
+        model: torch.nn.Module = None,
+        criterion: torch.nn.Module = None,
+        optimizer: optim.Optimizer = None,
+        epochs: int = None,
+    ) -> dict:
+        """Model training procedure."""
+
+        # Read hyperparameters for training (the dataloader parameters are defined in get_dataloader)
+        if epochs is None:
+            raise ValueError("epochs not found in configs")
+
+        # Prepare training
+        gpu_or_cpu = device("cuda" if cuda.is_available() else "cpu")
+        model.to(gpu_or_cpu)
+
+        # Training loop
+        for epoch in range(epochs):
+            train_loss, train_acc = 0, 0
+            model.train()
+            for inputs, labels in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
+                labels = labels.long()
+                inputs, labels = inputs.to(gpu_or_cpu, non_blocking=True), labels.to(gpu_or_cpu, non_blocking=True)
+                optimizer.zero_grad()
+                outputs = model(inputs)
+                loss = criterion(outputs, labels)
+                pred = outputs.data.max(1, keepdim=True)[1]
+                loss.backward()
+                optimizer.step()
+
+                # Accumulate training performance
+                train_acc += pred.eq(labels.data.view_as(pred)).sum()
+                train_loss += loss.item()
+
+            log_train_str = (
+                f"Epoch: {epoch+1}/{epochs} | Train Loss: {train_loss/len(dataloader):.8f} | "
+                f"Train Acc: {float(train_acc)/len(dataloader.dataset):.8f}")
+            self.logger.info(log_train_str)
+        model.to("cpu")
+
+        return {"model": model, "metrics": {"accuracy": train_acc, "loss": train_loss}}
diff --git a/examples/mia/cifar/utils/cifar_data_prepration.py b/examples/mia/cifar/utils/cifar_data_prepration.py
index e69de29..b074552 100644
--- a/examples/mia/cifar/utils/cifar_data_prepration.py
+++ b/examples/mia/cifar/utils/cifar_data_prepration.py
@@ -0,0 +1,142 @@
+import os
+import numpy as np
+import pandas as pd
+import joblib
+import pickle
+from sklearn.model_selection import train_test_split
+from torchvision import transforms, datasets
+from torchvision.datasets import CIFAR10, CIFAR100
+import urllib.request
+from torch.utils.data import Dataset, Subset, DataLoader
+from torch import tensor, float32, cat
+
+
+class CifarDataset(Dataset):
+    def __init__(self, x, y, transform=None, indices=None):
+        """
+        Custom dataset for CIFAR data.
+
+        Args:
+            x (torch.Tensor): Tensor of input images.
+            y (torch.Tensor): Tensor of labels.
+            transform (callable, optional): Optional transform to be applied on the image tensors.
+        """
+        self.x = x
+        self.y = y
+        self.transform = transform
+        self.indices = indices
+
+    def __len__(self):
+        """Return the total number of samples."""
+        return len(self.y)
+
+    def __getitem__(self, idx):
+        """Retrieve the image and its corresponding label at index 'idx'."""
+        image = self.x[idx]
+        label = self.y[idx]
+
+        # Apply transformations to the image if any
+        if self.transform:
+            image = self.transform(image)
+
+        return image, label
+
+    @classmethod
+    def from_cifar10(cls, root="./data", download=True, transform=None):
+        # Load the CIFAR10 train and test datasets
+        trainset = CIFAR10(root=root, train=True, download=download, transform=transforms.ToTensor())
+        testset = CIFAR10(root=root, train=False, download=download, transform=transforms.ToTensor())
+
+        # Concatenate both datasets' data and labels
+        data = cat([tensor(trainset.data, dtype=float32),
+                    tensor(testset.data, dtype=float32)],
+                   dim=0)
+        # Rescale data from [0, 255] to [0, 1]
+        data /= 255.0
+        normalize = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+        data = data.permute(0, 3, 1, 2)
+        data = normalize(data)
+
+        targets = cat([tensor(trainset.targets), tensor(testset.targets)], dim=0)
+
+        return cls(data, targets)
+
+    @classmethod
+    def from_cifar100(cls, root="./data", download=True, transform=None):
+        # Load the CIFAR100 train and test datasets
+        trainset = CIFAR100(root=root, train=True, download=download, transform=transforms.ToTensor())
+        testset = CIFAR100(root=root, train=False, download=download, transform=transforms.ToTensor())
+
+        # Concatenate both datasets' data and labels
+        data = cat([tensor(trainset.data, dtype=float32),
+                    tensor(testset.data, dtype=float32)],
+                   dim=0)
+        # Rescale data from [0, 255] to [0, 1]
+        data /= 255.0
+        normalize = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+        data = data.permute(0, 3, 1, 2)
+        data = normalize(data)
+
+        targets = cat([tensor(trainset.targets), tensor(testset.targets)], dim=0)
+
+        return cls(data, targets)
+
+    def subset(self, indices):
+        """Return a subset of the dataset based on the given indices."""
+        return CifarDataset(self.x[indices], self.y[indices], transform=self.transform, indices=indices)
+
+
+def get_cifar10_dataset(data_path):
+    # Create the combined CIFAR-10 dataset
+
+    transform = transforms.Compose(
+        [transforms.ToTensor(),
+         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+    population = CifarDataset.from_cifar10(root=data_path, download=True, transform=transform)
+
+    file_path = os.path.join(data_path, "cifar10.pkl")
+    if not os.path.exists(file_path):
+        with open(file_path, "wb") as file:
+            pickle.dump(population, file)
+            print(f"Saved population data to {file_path}")
+
+    # Split the population into pretrain, test, and train index subsets
+    pretrain_indices = list(range(50000))  # indices 0-49999 form the pretrain set
+    test_indices = list(range(50001, 51000))  # indices 50001-50999 form the test set
+    client_indices = list(range(51001, 51002))  # index 51001 forms the training set
+    trainset = population.subset(client_indices)
+    testset = population.subset(test_indices)
+    pretrainset = population.subset(pretrain_indices)
+
+    return trainset, testset, pretrainset
+
+
+def get_cifar100_dataset(data_path):
+    # Create the combined CIFAR-100 dataset
+
+    transform = transforms.Compose(
+        [transforms.ToTensor(),
+         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+    population = CifarDataset.from_cifar100(root=data_path, download=True, transform=transform)
+
+    file_path = os.path.join(data_path, "cifar100.pkl")
+    if not os.path.exists(file_path):
+        with open(file_path, "wb") as file:
+            pickle.dump(population, file)
+            print(f"Saved population data to {file_path}")
+
+    # Split the population into pretrain, test, and train index subsets
+    pretrain_indices = list(range(50000))  # indices 0-49999 form the pretrain set
+    test_indices = list(range(50001, 51000))  # indices 50001-50999 form the test set
+    client_indices = list(range(51001, 51002))  # index 51001 forms the training set
+    trainset = population.subset(client_indices)
+    testset = population.subset(test_indices)
+    pretrainset = population.subset(pretrain_indices)
+
+    return trainset, testset, pretrainset
+
+
diff --git a/examples/mia/cifar/utils/cifar_model_prepration.py b/examples/mia/cifar/utils/cifar_model_prepration.py
index e69de29..77b17ff 100644
--- a/examples/mia/cifar/utils/cifar_model_prepration.py
+++ b/examples/mia/cifar/utils/cifar_model_prepration.py
@@ -0,0 +1,108 @@
+import torch.nn as nn
+from torch import device, optim, cuda, no_grad, save
+import torchvision.models as models
+import pickle
+from tqdm import tqdm
+
+class ResNet18(nn.Module):
+    def __init__(self, num_classes):
+        super(ResNet18, self).__init__()
+        self.model = models.resnet18(weights=None)
+        self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)
+        self.init_params = {"num_classes": num_classes}
+
+    def forward(self, x):
+        return self.model(x)
+
+def evaluate(model, loader, criterion, device):
+    model.eval()
+    loss, acc = 0, 0
+    with no_grad():
+        for data, target in loader:
+            data, target = data.to(device), target.to(device)
+            output = model(data)
+            loss += criterion(output, target).item()
+            pred = output.argmax(dim=1)
+            acc += pred.eq(target).sum().item()
+    loss /= len(loader)
+    acc = float(acc) / len(loader.dataset)
+    return loss, acc
+
+def create_trained_model_and_metadata(model, train_loader, test_loader, epochs=10, metadata=None):
+    device_name = device("cuda" if cuda.is_available() else "cpu")
+    model.to(device_name)
+    model.train()
+
+    criterion = nn.CrossEntropyLoss()
+    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.8)
+    train_losses, train_accuracies = [], []
+    test_losses, test_accuracies = [], []
+
+    for e in tqdm(range(epochs), desc="Training Progress"):
+        model.train()
+        train_acc, train_loss = 0.0, 0.0
+
+        for data, target in train_loader:
+            data, target = data.to(device_name, non_blocking=True), target.to(device_name, non_blocking=True)
+            optimizer.zero_grad()
+            output = model(data)
+
+            loss = criterion(output, target)
+            pred = output.argmax(dim=1)  # for multi-class classification
+            train_acc += pred.eq(target).sum().item()
+
+            loss.backward()
+            optimizer.step()
+            train_loss += loss.item()
+
+        train_loss /= len(train_loader)
+        train_acc /= len(train_loader.dataset)
+
+        train_losses.append(train_loss)
+        train_accuracies.append(train_acc)
+
+        test_loss, test_acc = evaluate(model, test_loader, criterion, device_name)
+        test_losses.append(test_loss)
+        test_accuracies.append(test_acc)
+
+    # Move the model back to the CPU and store its weights
+    model.to("cpu")
+    with open("target/target_model.pkl", "wb") as f:
+        save(model.state_dict(), f)
+
+    # Create metadata and store it
+    meta_data = {}
+    meta_data["train_indices"] = train_loader.dataset.indices
+    meta_data["test_indices"] = test_loader.dataset.indices
+    meta_data["num_train"] = len(meta_data["train_indices"])
+
+    # Write init params
+    meta_data["init_params"] = {}
+    for key, value in model.init_params.items():
+        meta_data["init_params"][key] = value
+
+    # Read out optimizer parameters
+    meta_data["optimizer"] = {}
+    meta_data["optimizer"]["name"] = optimizer.__class__.__name__.lower()
+    meta_data["optimizer"]["lr"] = optimizer.param_groups[0].get("lr", 0)
+    meta_data["optimizer"]["weight_decay"] = optimizer.param_groups[0].get("weight_decay", 0)
+    meta_data["optimizer"]["momentum"] = optimizer.param_groups[0].get("momentum", 0)
+    meta_data["optimizer"]["dampening"] = optimizer.param_groups[0].get("dampening", 0)
+    meta_data["optimizer"]["nesterov"] = optimizer.param_groups[0].get("nesterov", False)
+
+    # Read out criterion parameters
+    meta_data["loss"] = {}
+    meta_data["loss"]["name"] = criterion.__class__.__name__.lower()
+
+    meta_data["batch_size"] = train_loader.batch_size
+    meta_data["epochs"] = epochs
+    meta_data["train_acc"] = train_acc
+    meta_data["test_acc"] = test_acc
+    meta_data["train_loss"] = train_loss
+    meta_data["test_loss"] = test_loss
+    meta_data["dataset"] = "cifar10"
+
+    with open("target/model_metadata.pkl", "wb") as f:
+        pickle.dump(meta_data, f)
+
+    return train_accuracies, train_losses, test_accuracies, test_losses
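
Note (not part of the patch): a minimal driver sketch of how the pieces above are expected to fit together for CIFAR-10, i.e. how to produce the ./data and ./target artifacts that audit.yml points to. It assumes it is run from examples/mia/cifar; the script name, batch size, and epoch count are illustrative assumptions, not values taken from this commit.

# hypothetical driver script, e.g. examples/mia/cifar/run_example.py (not in this patch)
import os

from torch.utils.data import DataLoader

from utils.cifar_data_prepration import get_cifar10_dataset
from utils.cifar_model_prepration import ResNet18, create_trained_model_and_metadata

data_path = "./data/"
os.makedirs(data_path, exist_ok=True)   # CIFAR-10 is downloaded and pickled here
os.makedirs("./target", exist_ok=True)  # create_trained_model_and_metadata writes target_model.pkl and model_metadata.pkl here

# Build the population splits defined in cifar_data_prepration.py
trainset, testset, pretrainset = get_cifar10_dataset(data_path)
train_loader = DataLoader(trainset, batch_size=64, shuffle=True)   # batch size is an assumption
test_loader = DataLoader(testset, batch_size=64, shuffle=False)

# Train the target model and write the metadata consumed by the audit
model = ResNet18(num_classes=10)  # CIFAR-10 has 10 classes
train_accs, train_losses, test_accs, test_losses = create_trained_model_and_metadata(
    model, train_loader, test_loader, epochs=10)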