diff --git a/config/audit.yaml b/config/audit.yaml deleted file mode 100755 index f9ea4a4d..00000000 --- a/config/audit.yaml +++ /dev/null @@ -1,127 +0,0 @@ -audit: # Configurations for auditing - random_seed: 1234 # Integer specifying the random seed - attack_list: - # rmia: - # training_data_fraction: 0.1 # Fraction of the auxilary dataset to use for this attack (in each shadow model training) - # attack_data_fraction: 0.1 # Fraction of auxiliary dataset to sample from during attack - # num_shadow_models: 3 # Number of shadow models to train - # online: True # perform online or offline attack - # temperature: 2 - # gamma: 2.0 - # offline_a: 0.33 # parameter from which we compute p(x) from p_OUT(x) such that p_IN(x) = a p_OUT(x) + b. - # offline_b: 0.66 - # qmia: - # training_data_fraction: 0.5 # Fraction of the auxilary dataset (data without train and test indices) to use for training the quantile regressor - # epochs: 5 # Number of training epochs for quantile regression - # population: - # attack_data_fraction: 0.1 # Fraction of the auxilary dataset to use for this attack - # lira: - # training_data_fraction: 0.1 # Fraction of the auxilary dataset to use for this attack (in each shadow model training) - # num_shadow_models: 8 # Number of shadow models to train - # online: False # perform online or offline attack - # fixed_variance: True # Use a fixed variance for the whole audit - # boosting: True - # loss_traj: - # training_distill_data_fraction : 0.2 # Fraction of the auxilary dataset to use for training the distillation models D_s = (1-D_KD)/2 - # number_of_traj: 1 # Number of epochs (number of points in the loss trajectory) - # label_only: "False" # True or False - # attack_data_dir: "./leakpro_output/attack_objects/loss_traj" - # mia_classifier_epochs: 10 - # HSJ: - # attack_data_fraction: 0.01 # Fraction of the auxilary dataset to use for this attack - # target_metadata_path: "./target/model_metadata.pkl" - # num_iterations: 2 # Number of iterations for the optimization - # initial_num_evals: 100 # Number of evaluations for number of random vecotr to estimate the gradient - # max_num_evals: 10000 # Maximum number of evaluations - # stepsize_search: "geometric_progression" # Step size search method - # gamma: 1.0 # Gamma for the optimization - # constraint: 2 - # batch_size: 50 - # verbose: True - # epsilon_threshold: 1e-6 - - # gia attacks - inverting_gradients: - total_variation: 0.000001 - attack_lr: 0.1 # lr for the reconstruction optimizer - at_iterations: 500 - gia_settings: - client_batch_size: 1 - num_client_images: 1 - epochs: 4 - optimizer: SGD - learning_rate: 0.0001 - dataset: cifar10 - data_dir: ./target/data - - - report_log: "./leakpro_output/results" # Folder to save the auditing report - config_log: "./leakpro_output/config" # Folder to save the configuration files - target_model_folder: "./target" - attack_folder: "attack_objects" - attack_type: "gia" #mia, gia - split_method: "no_overlapping" # Method of creating the attack dataset - -target: - # Target model path - module_path: "./leakpro/shadow_model_blueprints.py" - model_class: "ResNet18" #"ResNet18, SimpleCNN, ConvNet" - # Data paths - trained_model_path: "./target/target_model.pkl" - trained_model_metadata_path: "./target/model_metadata.pkl" - data_path: "./target/data/cifar10.pkl" - -shadow_model: - storage_path: "./leakpro_output/attack_objects/shadow_models" - # [Optional] Define a shadow model (if none, shadow model will follow the target model) - # Path to a Python file with the shadow model architecture - module_path: "./leakpro/shadow_model_blueprints.py" - # Name of the class to instantiate from the specified file - model_class: "ResNet18" #"ConvNet" - batch_size: 256 - epochs: 1 - - optimizer: - name: sgd #adam, sgd, rmsprop - lr: 0.01 - momentum: 0.9 - weight_decay: 0.0 - loss: - name: crossentropyloss # crossentropyloss, nllloss, mseloss - # Initialization parameters - init_params: {} - -distillation_model: - storage_path: "./leakpro_output/attack_objects/distillation_models" - #module_path: "./leakpro/shadow_model_blueprints.py" - #model_class: "ConvNet" - optimizer: - name: sgd #adam, sgd, rmsprop - lr: 0.01 - momentum: 0.9 - weight_decay: 0.001 - loss: - name: crossentropyloss # crossentropyloss, nllloss, mseloss - # Initialization parameters - init_params: {} - trained_model_path: "./leakpro_output/attack_objects/distillation_target_models/distillation_model.pkl" - trained_model_metadata_path: "./leakpro_output/attack_objects/distillation_target_models/model_metadata.pkl" - data_path: "./leakpro_output/attack_objects/distillation_target_models/cifar100.pkl" - -distillation_shadow_model: - storage_path: "./leakpro_output/attack_objects/distillation_shadow_models" - module_path: "./leakpro/shadow_model_blueprints.py" - # model_class: "ConvNet" - optimizer: - name: sgd #adam, sgd, rmsprop - lr: 0.001 - momentum: 0.9 - weight_decay: 0.001 - loss: - name: crossentropyloss # crossentropyloss, nllloss, mseloss - # Initialization parameters - init_params: {} - trained_model_path: "./leakpro_output/attack_objects/distillation_shadow_models/distillation_model.pkl" - trained_model_metadata_path: "./leakpro_output/attack_objects/distillation_shadow_models/model_metadata.pkl" - data_path: "./leakpro_output/attack_objects/distillation_shadow_models/cifar100.pkl" - diff --git a/config/dev_config/adult.yaml b/config/dev_config/adult.yaml deleted file mode 100755 index a0271d3a..00000000 --- a/config/dev_config/adult.yaml +++ /dev/null @@ -1,15 +0,0 @@ -train: # Configuration for training - type: pytorch # Training framework (we only support pytorch now). - num_target_model: 1 #Integer number for indicating how many target models we want to audit for the privacy game - epochs: 100 # Integer number for indicating the epochs for training target model. For speedyresnet, it uses its own number of epochs. - batch_size: 128 # Integer number for indicating batch size for training the target model. For speedyresnet, it uses its own batch size. - optimizer: SGD # String which indicates the optimizer. We support Adam and SGD. For speedyresnet, it uses its own optimizer. - learning_rate: 0.1 # Float number for indicating learning rate for training the target model. For speedyresnet, it uses its own learning_rate. - weight_decay: 0.01 # Float number for indicating weight decay for training the target model. For speedyresnet, it uses its own weight_decay. - test_batch_size: 256 - -data: # Configuration for data - dataset: adult # String indicates the name of the dataset (i.e., cifar10, cifar100, purchase100, texas1000) - f_train: 0.01 # Float number from 0 to 1 indicating the fraction of the train dataset - f_test: 0.01 # Float number from 0 to 1 indicating the fraction of the test dataset - data_dir: ./target/data # String about where to save the data. \ No newline at end of file diff --git a/config/dev_config/cifar10.yaml b/config/dev_config/cifar10.yaml deleted file mode 100755 index efa586d2..00000000 --- a/config/dev_config/cifar10.yaml +++ /dev/null @@ -1,23 +0,0 @@ -run: # Configurations for a specific run - random_seed: 1234 # Integer number of specifying random seed - log_dir: target # String for indicating where to save all the information, including models and computed signals. We can reuse the models saved in the same log_dir. - -train: # Configuration for training - type: pytorch # Training framework (we only support pytorch now). - num_target_model: 1 #Integer number for indicating how many target models we want to audit for the privacy game - epochs: 10 # Integer number for indicating the epochs for training target model. For speedyresnet, it uses its own number of epochs. - batch_size: 128 # Integer number for indicating batch size for training the target model. For speedyresnet, it uses its own batch size. - optimizer: SGD # String which indicates the optimizer. We support Adam and SGD. For speedyresnet, it uses its own optimizer. - learning_rate: 0.01 # Float number for indicating learning rate for training the target model. For speedyresnet, it uses its own learning_rate. - momentum: 0.9 - weight_decay: 0.0 # Float number for indicating weight decay for training the target model. For speedyresnet, it uses its own weight_decay. - test_batch_size: 128 - num_classes: 10 # Integer number for indicating the number of classes in the dataset - -data: # Configuration for data - dataset: cifar10 # String indicates the name of the dataset - f_train: 0.1 # Float number from 0 to 1 indicating the fraction of the train dataset - f_test: 0.1 # Float number from 0 to 1 indicating the size of the test set - data_dir: ./target/data # String about where to save the data. - - diff --git a/config/dev_config/cifar100.yaml b/config/dev_config/cifar100.yaml deleted file mode 100755 index f71f7927..00000000 --- a/config/dev_config/cifar100.yaml +++ /dev/null @@ -1,23 +0,0 @@ -run: # Configurations for a specific run - random_seed: 1234 # Integer number of specifying random seed - log_dir: target # String for indicating where to save all the information, including models and computed signals. We can reuse the models saved in the same log_dir. - -train: # Configuration for training - type: pytorch # Training framework (we only support pytorch now). - num_target_model: 1 #Integer number for indicating how many target models we want to audit for the privacy game - epochs: 2 # Integer number for indicating the epochs for training target model. For speedyresnet, it uses its own number of epochs. - batch_size: 128 # Integer number for indicating batch size for training the target model. For speedyresnet, it uses its own batch size. - optimizer: SGD # String which indicates the optimizer. We support Adam and SGD. For speedyresnet, it uses its own optimizer. - learning_rate: 0.01 # Float number for indicating learning rate for training the target model. For speedyresnet, it uses its own learning_rate. - momentum: 0.9 - weight_decay: 0.0 # Float number for indicating weight decay for training the target model. For speedyresnet, it uses its own weight_decay. - test_batch_size: 256 - num_classes: 100 # Integer number for indicating the number of classes in the dataset - -data: # Configuration for data - dataset: cifar100 # String indicates the name of the dataset - f_train: 0.495 # Float number from 0 to 1 indicating the fraction of the train dataset - f_test: 0.495 # Float number from 0 to 1 indicating the size of the test set - data_dir: ./target/data # String about where to save the data. - - diff --git a/config/dev_config/cinic10.yaml b/config/dev_config/cinic10.yaml deleted file mode 100755 index 3e06ea34..00000000 --- a/config/dev_config/cinic10.yaml +++ /dev/null @@ -1,21 +0,0 @@ -run: # Configurations for a specific run - random_seed: 1234 # Integer number of specifying random seed - log_dir: target # String for indicating where to save all the information, including models and computed signals. We can reuse the models saved in the same log_dir. - -train: # Configuration for training - type: pytorch # Training framework (we only support pytorch now). - num_target_model: 1 #Integer number for indicating how many target models we want to audit for the privacy game - epochs: 50 # Integer number for indicating the epochs for training target model. For speedyresnet, it uses its own number of epochs. - batch_size: 64 # Integer number for indicating batch size for training the target model. For speedyresnet, it uses its own batch size. - optimizer: SGD # String which indicates the optimizer. We support Adam and SGD. For speedyresnet, it uses its own optimizer. - learning_rate: 0.01 # Float number for indicating learning rate for training the target model. For speedyresnet, it uses its own learning_rate. - momentum: 0.9 - weight_decay: 0.01 # Float number for indicating weight decay for training the target model. For speedyresnet, it uses its own weight_decay. - test_batch_size: 256 - num_classes: 10 # Integer number for indicating the number of classes in the dataset - -data: # Configuration for data - dataset: cinic10 # String indicates the name of the dataset - f_train: 0.1 # Float number from 0 to 1 indicating the fraction of the train dataset - f_test: 0.1 # Float number from 0 to 1 indicating the size of the test set - data_dir: ./target/data # String about where to save the data. \ No newline at end of file diff --git a/dev_utils/cifar100_input_handler.py b/dev_utils/cifar100_input_handler.py deleted file mode 100755 index 0dcb2256..00000000 --- a/dev_utils/cifar100_input_handler.py +++ /dev/null @@ -1,73 +0,0 @@ -"""Module containing the class to handle the user input for the CIFAR100 dataset.""" - -import logging - -import torch -from torch import cuda, device, optim -from torch.utils.data import DataLoader -from tqdm import tqdm - -from leakpro.user_inputs.abstract_input_handler import AbstractInputHandler -from leakpro.utils.import_helper import Self - - -class Cifar100InputHandler(AbstractInputHandler): - """Class to handle the user input for the CIFAR100 dataset.""" - - def __init__(self:Self, configs: dict, logger:logging.Logger) -> None: - super().__init__(configs = configs, logger = logger) - - - def get_criterion(self:Self)->None: - """Set the CrossEntropyLoss for the model.""" - return torch.nn.CrossEntropyLoss() - - def get_optimizer(self: Self, model:torch.nn.Module) -> None: - """Set the optimizer for the model.""" - learning_rate = 0.1 - momentum = 0.8 - return optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum) - - def train( - self: Self, - dataloader: DataLoader, - model: torch.nn.Module = None, - criterion: torch.nn.Module = None, - optimizer: optim.Optimizer = None, - epochs: int = None, - ) -> dict: - """Model training procedure.""" - - # read hyperparams for training (the parameters for the dataloader are defined in get_dataloader): - if epochs is None: - raise ValueError("epochs not found in configs") - - # prepare training - gpu_or_cpu = device("cuda" if cuda.is_available() else "cpu") - model.to(gpu_or_cpu) - - # training loop - for epoch in range(epochs): - train_loss, train_acc = 0, 0 - model.train() - for inputs, labels in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"): - labels = labels.long() - inputs, labels = inputs.to(gpu_or_cpu, non_blocking=True), labels.to(gpu_or_cpu, non_blocking=True) - optimizer.zero_grad() - outputs = model(inputs) - loss = criterion(outputs, labels) - pred = outputs.data.max(1, keepdim=True)[1] - loss.backward() - optimizer.step() - - # Accumulate performance of shadow model - train_acc += pred.eq(labels.data.view_as(pred)).sum() - train_loss += loss.item() - - log_train_str = ( - f"Epoch: {epoch+1}/{epochs} | Train Loss: {train_loss/len(dataloader):.8f} | " - f"Train Acc: {float(train_acc)/len(dataloader.dataset):.8f}") - self.logger.info(log_train_str) - model.to("cpu") - - return {"model": model, "metrics": {"accuracy": train_acc, "loss": train_loss}} diff --git a/dev_utils/cifar10_gia_input_handler.py b/dev_utils/cifar10_gia_input_handler.py deleted file mode 100755 index 2ec25de7..00000000 --- a/dev_utils/cifar10_gia_input_handler.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Module containing the class to handle the user input for the CIFAR10 dataset.""" - -import logging -from collections import OrderedDict - -from torch import cuda, device -from torch.nn import Module -from torch.utils.data import DataLoader - -from dev_utils.data_modules import CifarModule -from leakpro.fl_utils.gia_module_to_functional import MetaModule -from leakpro.fl_utils.gia_optimizers import MetaOptimizer -from leakpro.user_inputs.abstract_gia_input_handler import AbstractGIAInputHandler -from leakpro.utils.import_helper import Self - - -class Cifar10GIAInputHandler(AbstractGIAInputHandler): - """Class to handle the user input for the CIFAR10 dataset.""" - - def __init__(self:Self, configs: dict, logger:logging.Logger, target_model: Module) -> None: - self.data_module = CifarModule(batch_size=configs["audit"]["gia_settings"]["client_batch_size"]) - super().__init__(configs, logger, target_model, self.data_module) - self.criterion = self.get_criterion() - - def train( - self: Self, - data: DataLoader = None, - optimizer: MetaOptimizer = None, - ) -> list: - """Model training procedure for GIA. - - This training will create a computational graph through multiple steps, which is necessary - for backpropagating to an input image. - - Requires a meta optimizer that performs step to a new set of parameters to keep a functioning - graph. - - Training does not update the original model, but returns a norm of what the update would have been. - """ - gpu_or_cpu = device("cuda" if cuda.is_available() else "cpu") - self.target_model.to(gpu_or_cpu) - patched_model = MetaModule(self.target_model) - - outputs = None - epochs = self.configs["audit"]["gia_settings"]["epochs"] - for _ in range(epochs): - train_loss, train_acc = 0, 0 - for inputs, labels in data: - labels = labels.long() - inputs, labels = inputs.to(gpu_or_cpu, non_blocking=True), labels.to(gpu_or_cpu, non_blocking=True) - outputs = patched_model(inputs, patched_model.parameters) - loss = self.criterion(outputs, labels).sum() - pred = outputs.data.max(1, keepdim=True)[1] - patched_model.parameters = optimizer.step(loss, patched_model.parameters) - train_acc += pred.eq(labels.data.view_as(pred)).sum() - train_loss += loss.item() - model_delta = OrderedDict((name, param - param_origin) - for ((name, param), (name_origin, param_origin)) - in zip(patched_model.parameters.items(), - OrderedDict(self.target_model.named_parameters()).items())) - return list(model_delta.values()) diff --git a/dev_utils/cifar10_input_handler.py b/dev_utils/cifar10_input_handler.py deleted file mode 100755 index 8108f0c2..00000000 --- a/dev_utils/cifar10_input_handler.py +++ /dev/null @@ -1,73 +0,0 @@ -"""Module containing the class to handle the user input for the CIFAR10 dataset.""" - -import logging - -import torch -from torch import cuda, device, optim -from torch.utils.data import DataLoader -from tqdm import tqdm - -from leakpro.user_inputs.abstract_input_handler import AbstractInputHandler -from leakpro.utils.import_helper import Self - - -class Cifar10InputHandler(AbstractInputHandler): - """Class to handle the user input for the CIFAR10 dataset.""" - - def __init__(self:Self, configs: dict, logger:logging.Logger) -> None: - super().__init__(configs = configs, logger = logger) - - - def get_criterion(self:Self)->None: - """Set the CrossEntropyLoss for the model.""" - return torch.nn.CrossEntropyLoss() - - def get_optimizer(self: Self, model:torch.nn.Module) -> None: - """Set the optimizer for the model.""" - learning_rate = 0.1 - momentum = 0.8 - return optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum) - - def train( - self: Self, - dataloader: DataLoader, - model: torch.nn.Module = None, - criterion: torch.nn.Module = None, - optimizer: optim.Optimizer = None, - epochs: int = None, - ) -> dict: - """Model training procedure.""" - - # read hyperparams for training (the parameters for the dataloader are defined in get_dataloader): - if epochs is None: - raise ValueError("epochs not found in configs") - - # prepare training - gpu_or_cpu = device("cuda" if cuda.is_available() else "cpu") - model.to(gpu_or_cpu) - - # training loop - for epoch in range(epochs): - train_loss, train_acc = 0, 0 - model.train() - for inputs, labels in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"): - labels = labels.long() - inputs, labels = inputs.to(gpu_or_cpu, non_blocking=True), labels.to(gpu_or_cpu, non_blocking=True) - optimizer.zero_grad() - outputs = model(inputs) - loss = criterion(outputs, labels) - pred = outputs.data.max(1, keepdim=True)[1] - loss.backward() - optimizer.step() - - # Accumulate performance of shadow model - train_acc += pred.eq(labels.data.view_as(pred)).sum() - train_loss += loss.item() - - log_train_str = ( - f"Epoch: {epoch+1}/{epochs} | Train Loss: {train_loss/len(dataloader):.8f} | " - f"Train Acc: {float(train_acc)/len(dataloader.dataset):.8f}") - self.logger.info(log_train_str) - model.to("cpu") - - return {"model": model, "metrics": {"accuracy": train_acc, "loss": train_loss}} diff --git a/dev_utils/data_modules.py b/dev_utils/data_modules.py deleted file mode 100755 index f4cacdc0..00000000 --- a/dev_utils/data_modules.py +++ /dev/null @@ -1,115 +0,0 @@ -"""Module with functions for preparing the dataset for training the target models.""" -from abc import ABC, abstractmethod - -import numpy as np -import torchvision -from torch import Tensor, as_tensor, cat, cuda, device, mean, randn, std, tensor -from torch.utils.data import DataLoader, Dataset, Subset, TensorDataset -from torchvision import transforms - -from leakpro.utils.import_helper import List, Self - -DEVICE = device("cuda" if cuda.is_available() else "cpu") - -class DataModule(ABC): - """Abstract base class for data modules.""" - - @abstractmethod - def get_train_val_loaders(self: Self) -> tuple[DataLoader, DataLoader]: - """Abstract method to get train and validation loaders.""" - pass - - @abstractmethod - def get_meanstd(self: Self) -> tuple[Tensor, Tensor]: - """Abstract method to get the mean and std of the dataset.""" - pass - - @abstractmethod - def get_subset(self: Self, num_examples: int) -> DataLoader: - """Abstract method to get a subset of the validation data.""" - pass - - @abstractmethod - def get_subset_idx(self: Self, indexes: List[int]) -> DataLoader: - """Abstract method to get a DataLoader with elements corresponding to the given indexes.""" - pass - - @abstractmethod - def get_at_images(self: Self, client_loader: DataLoader) -> DataLoader: - """Abstract method to get DataLoader with random noise images.""" - pass - -class CifarModule(DataModule): - """Module working with the Cifar10 dataset.""" - - def __init__(self: Self, root: str = "./data", batch_size: int = 32, num_workers: int = 2) -> None: - trainset = torchvision.datasets.CIFAR10(root=root, train=True, download=True, transform=transforms.ToTensor()) - valset = torchvision.datasets.CIFAR10(root=root, train=False, download=True, transform=transforms.ToTensor()) - - data_mean, data_std = self._get_meanstd(trainset) - - transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(data_mean, data_std)]) - - trainset.transform = transform - valset.transform = transform - - self.trainloader = DataLoader(trainset, batch_size=batch_size, - shuffle=False, drop_last=True, num_workers=num_workers) - self.valloader = DataLoader(valset, batch_size=batch_size, - shuffle=False, drop_last=False, num_workers=num_workers) - self.data_mean = as_tensor(data_mean)[:, None, None] - self.data_std = as_tensor(data_std)[:, None, None] - self.batch_size = batch_size - self.num_workers = num_workers - - def get_train_val_loaders(self: Self) -> tuple[DataLoader, DataLoader]: - """Getter for train and validation loader.""" - return self.trainloader, self.valloader - - def get_meanstd(self:Self) -> tuple[Tensor, Tensor]: - """Get mean and std for the dataset.""" - return self.data_mean, self.data_std - - def _get_meanstd(self: Self, trainset: Dataset) -> tuple[Tensor, Tensor]: - cc = cat([trainset[i][0].reshape(3, -1) for i in range(len(trainset))], dim=1) - data_mean = mean(cc, dim=1).tolist() - data_std = std(cc, dim=1).tolist() - return data_mean, data_std - - def get_subset(self: Self, num_examples: int) -> DataLoader: - """Get a dataloader with num_examples elements from the validation loader.""" - target_ids = np.random.choice(len(self.valloader.dataset), size=num_examples, replace=False) - subset = Subset(self.valloader.dataset, target_ids) - - return DataLoader( - subset, - batch_size=self.batch_size, - num_workers=self.num_workers, - shuffle=False - ) - - def get_subset_idx(self: Self, indexes: List[int]) -> DataLoader: - """Get a DataLoader with elements corresponding to the given indexes from the validation loader.""" - subset = Subset(self.valloader.dataset, indexes) - return DataLoader( - subset, - batch_size=self.batch_size, - num_workers=self.num_workers, - shuffle=False - ) - - def get_at_images(self: Self, client_loader: DataLoader) -> DataLoader: - """DataLoader with random noise images of the same shape as the client_loader's dataset, using the same labels.""" - img_shape = client_loader.dataset[0][0].shape - num_images = len(client_loader.dataset) - reconstruction = randn((num_images, *img_shape)) - labels = [] - for _, label in client_loader: - labels.extend(label.numpy()) - labels = tensor(labels) - reconstruction_dataset = TensorDataset(reconstruction, labels) - reconstruction_loader = DataLoader(reconstruction_dataset, batch_size=32, shuffle=True) - return reconstruction, reconstruction_loader - diff --git a/dev_utils/data_preparation.py b/dev_utils/data_preparation.py deleted file mode 100755 index e4e88bbb..00000000 --- a/dev_utils/data_preparation.py +++ /dev/null @@ -1,200 +0,0 @@ -"""Module with functions for preparing the dataset for training the target models.""" - -import logging -import os -import pickle -import tarfile -from pathlib import Path -from urllib.request import urlretrieve - -import joblib -import numpy as np -import pandas as pd -import torchvision -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler -from torchvision import transforms - -from leakpro.dataset import GeneralDataset - - -def get_adult_dataset(dataset_name: str, data_dir: str, logger:logging.Logger) -> GeneralDataset: - """Get the dataset.""" - path = f"{data_dir}/{dataset_name}" - if os.path.exists(f"{path}.pkl"): - with open(f"{path}.pkl", "rb") as file: - all_data = joblib.load(file) - logger.info(f"Load data from {path}.pkl") - else: - column_names = [ - "age", - "workclass", - "fnlwgt", - "education", - "education-num", - "marital-status", - "occupation", - "relationship", - "race", - "sex", - "capital-gain", - "capital-loss", - "hours-per-week", - "native-country", - "income", - ] - df_train = pd.read_csv(f"{path}/{dataset_name}.data", names=column_names) - df_test = pd.read_csv( - f"{path}/{dataset_name}.test", names=column_names, header=0 - ) - df_test["income"] = df_test["income"].str.replace(".", "", regex=False) - df_concatenated = pd.concat([df_train, df_test], axis=0) - df_replaced = df_concatenated.replace(" ?", np.nan) - df_clean = df_replaced.dropna() - x, y = df_clean.iloc[:, :-1], df_clean.iloc[:, -1] - - categorical_features = [col for col in x.columns if x[col].dtype == "object"] - numerical_features = [ - col for col in x.columns if x[col].dtype in ["int64", "float64"] - ] - - onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore") - x_categorical = onehot_encoder.fit_transform(x[categorical_features]) - - scaler = StandardScaler() - x_numerical = scaler.fit_transform(x[numerical_features]) - - x = np.hstack([x_numerical, x_categorical]) - - # label encode the target variable to have the classes 0 and 1 - y = LabelEncoder().fit_transform(y) - - all_data = GeneralDataset(x,y) - Path(path).mkdir(parents=True, exist_ok=True) - save_dataset(all_data, path, logger) - return all_data - -def get_cifar10_dataset(dataset_name: str, data_dir: str, logger:logging.Logger) -> GeneralDataset: - """Get the dataset.""" - path = f"{data_dir}/{dataset_name}" - - if os.path.exists(f"{path}.pkl"): - with open(f"{path}.pkl", "rb") as file: - all_data = joblib.load(file) - logger.info(f"Load data from {path}.pkl") - else: - logger.info("Downloading CIFAR-10 dataset") - transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) - trainset = torchvision.datasets.CIFAR10(root="./data/cifar10", train=True, download=True, transform=transform) - testset = torchvision.datasets.CIFAR10(root="./data/cifar10", train=False,download=True, transform=transform) - - x = np.vstack([trainset.data, testset.data]) - y = np.hstack([trainset.targets, testset.targets]) - - all_data = GeneralDataset(x, y, transform) - Path(path).mkdir(parents=True, exist_ok=True) - save_dataset(all_data, path, logger) - return all_data - -def get_cifar100_dataset(dataset_name: str, data_dir: str, logger:logging.Logger) -> GeneralDataset: - """Get the dataset.""" - path = f"{data_dir}/{dataset_name}" - - if os.path.exists(f"{path}.pkl"): - with open(f"{path}.pkl", "rb") as file: - all_data = joblib.load(file) - logger.info(f"Load data from {path}.pkl") - else: - logger.info("Downloading CIFAR-100 dataset") - transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) - trainset = torchvision.datasets.CIFAR100(root="./data/cifar100", train=True, download=True, transform=transform) - testset = torchvision.datasets.CIFAR100(root="./data/cifar100", train=False,download=True, transform=transform) - - x = np.vstack([trainset.data, testset.data]) - y = np.hstack([trainset.targets, testset.targets]) - - all_data = GeneralDataset(x, y, transform) - Path(path).mkdir(parents=True, exist_ok=True) - save_dataset(all_data, path, logger) - return all_data - - -def download_file(url: str, download_path: str) -> None: - """Download a file from a given URL.""" - try: - urlretrieve(url, download_path) # noqa: S310 - except Exception as e: - error_msg = f"Failed to download file from {url}: {e}" - raise RuntimeError(error_msg) from e - -def extract_tar(tar_path: str, extract_path: str) -> None: - """Extract a tar file to a given path.""" - with tarfile.open(tar_path, "r:gz") as tar: - tar.extractall(extract_path) # noqa: S202 - -def get_cinic10_dataset(dataset_name: str, data_dir: str, logger:logging.Logger) -> GeneralDataset: - """Get the dataset.""" - path = f"{data_dir}/{dataset_name}" - if os.path.exists(f"{path}.pkl"): - with open(f"{path}.pkl", "rb") as file: - all_data = joblib.load(file) - logger.info(f"Load data from {path}.pkl") - else: - if not os.path.exists("./data/cinic10"): - logger.info("Downloading CINIC-10 dataset") - os.makedirs("./data/cinic10") - url = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3192/CINIC-10.tar.gz" - download_path = "./data/CINIC-10.tar.gz" - download_file(url, download_path) - extract_tar(download_path, "./data/cinic10") - os.remove(download_path) - - transform = transforms.Compose([transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), - (0.5, 0.5, 0.5))]) - - trainset = torchvision.datasets.ImageFolder(root="./data/cinic10/train", transform=transform) - testset = torchvision.datasets.ImageFolder(root="./data/cinic10/test", transform=transform) - validset = torchvision.datasets.ImageFolder(root="./data/cinic10/valid", transform=transform) - - train_data, train_targets = zip(*[(image.numpy(), target) for image, target in trainset]) - test_data, test_targets = zip(*[(image.numpy(), target) for image, target in testset]) - valid_data, valid_targets = zip(*[(image.numpy(), target) for image, target in validset]) - - x = np.vstack([train_data, test_data, valid_data]) - x = np.transpose(x, (0, 2, 3, 1)) - y = np.hstack([train_targets, test_targets, valid_targets]) - - all_data = GeneralDataset(x, y, transform) - Path(path).mkdir(parents=True, exist_ok=True) - save_dataset(all_data, path, logger) - return all_data - -def save_dataset(all_data: GeneralDataset, path: str, logger:logging.Logger) -> GeneralDataset: - """Save the dataset.""" - with open(f"{path}.pkl", "wb") as file: - pickle.dump(all_data, file) - logger.info(f"Save data to {path}.pkl") - -def prepare_train_test_datasets(dataset_size: int, configs: dict) -> dict: - """Prepare the dataset for training the target models when the training data are sampled uniformly from the population. - - Args: - ---- - dataset_size (int): Size of the whole dataset - num_datasets (int): Number of datasets we should generate - configs (dict): Data split configuration - - Returns: - ------- - dict: Data split information which saves the information of training points index and test points index. - - """ - # The index_list will save all the information about the train, test and auit for each target model. - all_index = np.arange(dataset_size) - train_size = int(configs["f_train"] * dataset_size) - test_size = int(configs["f_test"] * dataset_size) - - selected_index = np.random.choice(all_index, train_size + test_size, replace=False) - train_index, test_index = train_test_split(selected_index, test_size=test_size) - return {"train_indices": train_index, "test_indices": test_index} diff --git a/dev_utils/graph_utils.py b/dev_utils/graph_utils.py deleted file mode 100755 index 8ee5d71d..00000000 --- a/dev_utils/graph_utils.py +++ /dev/null @@ -1,22 +0,0 @@ -"""Functions relating to computational graph investigations.""" -from torch.nn import Module - - -def calculate_graph_depth(grad_fn: Module, cache: dict=None) -> int: - """Calculates graph depth from a tensors grad_fn.""" - if not grad_fn: - return 0 - if cache is None: - cache = {} - - if grad_fn in cache: - return cache[grad_fn] - - max_depth = 0 - for sub_fn, _ in grad_fn.next_functions: - if sub_fn is not None: - sub_depth = calculate_graph_depth(sub_fn, cache) + 1 - max_depth = max(sub_depth, max_depth) - - cache[grad_fn] = max_depth - return max_depth diff --git a/dev_utils/train.py b/dev_utils/train.py deleted file mode 100755 index 23881a22..00000000 --- a/dev_utils/train.py +++ /dev/null @@ -1,264 +0,0 @@ -"""Contains functions for training and testing the model.""" - -import logging -import pickle -import time -from pathlib import Path - -import torch -from torch import nn, optim -from tqdm import tqdm - -from leakpro.utils.import_helper import Tuple - - -def get_optimizer(model: torch.nn.Module, configs: dict, logger: logging.Logger = None) -> torch.optim.Optimizer: - """Get the optimizer for training the model. - - Args: - ---- - model (torch.nn.Module): Model for optimization. - configs (dict): Configurations for optimization. - logger (logging.Logger, optional): Logger for logging information (default: None). - - Returns: - ------- - torch.optim.Optimizer: Optimizer for training the model. - - """ - optimizer = configs["train"].get("optimizer", "SGD") - learning_rate = configs["train"].get("learning_rate", 0.01) - weight_decay = configs["train"].get("weight_decay", 0) - momentum = configs["train"].get("momentum", 0) - - logger.info(f"Load the optimizer {optimizer}") - logger.info(f"Learning rate {learning_rate}") - logger.info(f"Weight decay {weight_decay} ") - - if optimizer == "SGD": - logger.info(f"Momentum {momentum} ") - return torch.optim.SGD( - model.parameters(), - lr=learning_rate, - weight_decay=weight_decay, - momentum=momentum, - ) - if optimizer == "Adam": - return torch.optim.Adam( - model.parameters(), lr=learning_rate, weight_decay=weight_decay - ) - if optimizer == "AdamW": - return torch.optim.AdamW( - model.parameters(), lr=learning_rate, weight_decay=weight_decay - ) - - raise NotImplementedError( - f"Optimizer {optimizer} has not been implemented. Please choose from SGD or Adam" - ) - - -# Test Function -def inference( - model: torch.nn.Module, loader: torch.utils.data.DataLoader, device: str -) -> Tuple[float, float]: - """Evaluate the model performance on the test loader. - - Args: - ---- - model (torch.nn.Module): Model for evaluation - loader (torch.utils.data.DataLoader): Data Loader for testing - device (str): GPU or CPU - - Return: - ------ - loss (float): Loss for the given model on the test dataset. - acc (float): Accuracy for the given model on the test dataset. - - """ - # Setting model to eval mode and moving to specified device - model.eval() - model.to(device) - - # Assigning variables for computing loss and accuracy - loss, acc, criterion = 0, 0, nn.CrossEntropyLoss() - - - # Disable gradient calculation to save memory - with torch.no_grad(): - for data, target in loader: - # Moving data and target to the device - data, target = data.to(device), target.to(device) # noqa: PLW2901 - # Cast target to long tensor - target = target.long() # noqa: PLW2901 - - # Computing output and loss - output = model(data) - loss += criterion(output, target).item() - - # Computing accuracy - pred = output.data.max(1, keepdim=True)[1] - acc += pred.eq(target.data.view_as(pred)).sum() - - # Averaging the losses - loss /= len(loader) - - # Calculating accuracy - acc = float(acc) / len(loader.dataset) - - # Return loss and accuracy - return loss, acc - -def train( # noqa: PLR0913 - model: torch.nn.Module, - train_loader: torch.utils.data.DataLoader, - configs: dict, - test_loader: torch.utils.data.DataLoader = None, - data_split: dict = None, - logger: logging.Logger = None, -) -> torch.nn.Module: - """Train the model based on on the train loader. - - Args: - ---- - model(nn.Module): Model for evaluation. - train_loader(torch.utils.data.DataLoader): Data loader for training. - configs (dict): Configurations for training. - test_loader (torch.utils.data.DataLoader, optional): Data loader for testing (default: None). - data_split (dict, optional): Data split for training and testing (default: None). - logger (logging.Logger, optional): Logger for logging information (default: None). - - Return: - ------ - nn.Module: Trained model. - - """ - # Get the device for training - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - # Set the model to the device - model.to(device) - model.train() - # Set the loss function and optimizer - criterion = nn.CrossEntropyLoss() - optimizer = get_optimizer(model, configs, logger) - # Get the number of epochs for training - epochs = configs["train"]["epochs"] - - # Loop over each epoch - for epoch_idx in range(epochs): - start_time = time.time() - train_loss, train_acc = 0, 0 - # Loop over the training set - model.train() - with tqdm(train_loader, desc=f"Epoch {epoch_idx + 1}/{epochs}") as pbar: - for data, target in pbar: - # Cast target to long tensor - target = target.long() # noqa: PLW2901 - - # Move data to the device - data, target = data.to(device, non_blocking=True), target.to(device, non_blocking=True) # noqa: PLW2901 - - # Set the gradients to zero - optimizer.zero_grad() - - # Get the model output - output = model(data) - # Calculate the loss - loss = criterion(output, target) - pred = output.data.max(1, keepdim=True)[1] - train_acc += pred.eq(target.data.view_as(pred)).sum() - # Perform the backward pass - loss.backward() - # Take a step using optimizer - optimizer.step() - # Add the loss to the total loss - train_loss += loss.item() - - # Log the training loss and accuracy - log_train_str = f"Epoch: {epoch_idx+1}/{epochs} | Train Loss: {train_loss/len(train_loader):.8f} | Train Acc: {float(train_acc)/len(train_loader.dataset):.8f} | One step uses {time.time() - start_time:.2f} seconds" # noqa: E501 - logger.info(log_train_str) - - test_loss, test_acc = inference(model, test_loader, device) - - log_test_str = f"Epoch: {epoch_idx+1}/{epochs} | Test Loss: {test_loss:.8f} | Test Acc: {test_acc:.8f}" - logger.info(log_test_str) - - # Move the model back to the CPU - model.to("cpu") - - save_model_and_metadata( - model, data_split, configs, train_acc, test_acc, train_loss, test_loss, optimizer, criterion - ) - - # Return the model - return model - - -def save_model_and_metadata( # noqa: PLR0913 - model: torch.nn.Module, - data_split: dict, - configs: dict, - train_acc: float, - test_acc: float, - train_loss: float, - test_loss: float, - optimizer: optim.Optimizer, - loss: nn.Module, -) -> None: - """Save the model and metadata. - - Args: - ---- - model (torch.nn.Module): Trained model. - data_split (dict): Data split for training and testing. - configs (dict): Configurations for training. - train_acc (float): Training accuracy. - test_acc (float): Testing accuracy. - train_loss (float): Training loss. - test_loss (float): Testing loss. - optimizer (str): Optimizer used for training. - loss (str): Loss function used for training. - - """ - # Save model and metadata - log_dir = configs["run"]["log_dir"] - Path(log_dir).mkdir(parents=True, exist_ok=True) - - with open(f"{log_dir}/target_model.pkl", "wb") as f: - torch.save(model.state_dict(), f) - meta_data = {} - - if hasattr(model, "init_params"): - meta_data["init_params"] = model.init_params - else: - meta_data["init_params"] = {"num_classes": configs["train"]["num_classes"]} - - meta_data["train_indices"] = data_split["train_indices"] - meta_data["test_indices"] = data_split["test_indices"] - meta_data["num_train"] = len(data_split["train_indices"]) - - # read out optimizer parameters - meta_data["optimizer"] = {} - meta_data["optimizer"]["name"] = optimizer.__class__.__name__.lower() - meta_data["optimizer"]["lr"] = optimizer.param_groups[0].get("lr", 0) - meta_data["optimizer"]["weight_decay"] = optimizer.param_groups[0].get("weight_decay", 0) - meta_data["optimizer"]["momentum"] = optimizer.param_groups[0].get("momentum", 0) - meta_data["optimizer"]["dampening"] = optimizer.param_groups[0].get("dampening", 0) - meta_data["optimizer"]["nesterov"] = optimizer.param_groups[0].get("nesterov", False) - - # read out loss parameters - meta_data["loss"] = {} - meta_data["loss"]["name"] = loss.__class__.__name__.lower() - - meta_data["batch_size"] = configs["train"]["batch_size"] - meta_data["epochs"] = configs["train"]["epochs"] - meta_data["learning_rate"] = configs["train"]["learning_rate"] - meta_data["weight_decay"] = configs["train"]["weight_decay"] - meta_data["train_acc"] = train_acc - meta_data["test_acc"] = test_acc - meta_data["train_loss"] = train_loss - meta_data["test_loss"] = test_loss - meta_data["dataset"] = configs["data"]["dataset"] - - with open(f"{log_dir}/model_metadata.pkl", "wb") as f: - pickle.dump(meta_data, f) diff --git a/leakpro_main.py b/leakpro_main.py deleted file mode 100644 index 3023798a..00000000 --- a/leakpro_main.py +++ /dev/null @@ -1,118 +0,0 @@ -"""Main script to run LEAKPRO on a target model.""" - -import logging -import os -import random -import time -from pathlib import Path - -import numpy as np -import yaml -from torch import manual_seed -from torch.utils.data import Subset - -import dev_utils.train as utils -from dev_utils import shadow_model_blueprints -from dev_utils.cifar10_input_handler import Cifar10InputHandler -from dev_utils.data_preparation import ( - get_adult_dataset, - get_cifar10_dataset, - get_cifar100_dataset, - get_cinic10_dataset, - prepare_train_test_datasets, -) -from leakpro.attacks.attack_scheduler import AttackScheduler -from leakpro.dataset import get_dataloader -from leakpro.reporting.utils import prepare_privacy_risk_report -from leakpro.utils.handler_logger import setup_log - - -def generate_user_input(configs: dict, retrain: bool = False, logger: logging.Logger = None)->None: - """Generate user input for the target model.""" - # ------------------------------------------------ - - # Create the population dataset and target_model - if configs["data"]["dataset"] == "adult": - population = get_adult_dataset(configs["data"]["dataset"], configs["data"]["data_dir"], logger) - target_model = shadow_model_blueprints.NN(configs["train"]["inputs"], configs["train"]["outputs"]) - elif configs["data"]["dataset"] == "cifar10": - population = get_cifar10_dataset(configs["data"]["dataset"], configs["data"]["data_dir"], logger) - target_model = shadow_model_blueprints.ResNet18(configs["train"]["num_classes"]) - elif configs["data"]["dataset"] == "cifar100": - population = get_cifar100_dataset(configs["data"]["dataset"], configs["data"]["data_dir"], logger) - target_model = shadow_model_blueprints.ResNet18(configs["train"]["num_classes"]) - elif configs["data"]["dataset"] == "cinic10": - population = get_cinic10_dataset(configs["data"]["dataset"], configs["data"]["data_dir"], logger) - target_model = shadow_model_blueprints.ResNet18(configs["train"]["num_classes"]) - - n_population = len(population) - - # Create target training dataset and test dataset - # NOTE: this should not be done as the model is provided by the user - train_test_dataset = prepare_train_test_datasets(n_population, configs["data"]) - - train_loader = get_dataloader( - Subset(population, train_test_dataset["train_indices"]), - batch_size=configs["train"]["batch_size"], - shuffle=True, - ) - test_loader = get_dataloader( - Subset(population, train_test_dataset["test_indices"]), - batch_size=configs["train"]["test_batch_size"], - ) - - if retrain: - target_model = utils.train(target_model, train_loader, configs, test_loader, train_test_dataset, logger) - - -if __name__ == "__main__": - - user_args = "./config/dev_config/cifar10.yaml" # noqa: ERA001 - - with open(user_args, "rb") as f: - user_configs = yaml.safe_load(f) - - # Setup logger - logger = setup_log("LeakPro", save_file=True) - - # Generate user input - generate_user_input(user_configs, retrain=True, logger=logger) # This is for developing purposes only - - start_time = time.time() - # ------------------------------------------------ - # LEAKPRO starts here - args = "./config/audit.yaml" - with open(args, "rb") as f: - configs = yaml.safe_load(f) - - # Set the random seed, log_dir and inference_game - manual_seed(configs["audit"]["random_seed"]) - np.random.seed(configs["audit"]["random_seed"]) - random.seed(configs["audit"]["random_seed"]) - - # Create directory to store results - report_dir = f"{configs['audit']['report_log']}" - Path(report_dir).mkdir(parents=True, exist_ok=True) - - # Create user input handler - handler = Cifar10InputHandler(configs=configs, logger=logger) - - attack_scheduler = AttackScheduler(handler) - audit_results = attack_scheduler.run_attacks() - - for attack_name in audit_results: - logger.info(f"Preparing results for attack: {attack_name}") - - prepare_privacy_risk_report( - audit_results[attack_name]["result_object"], - configs["audit"], - save_path=f"{report_dir}/{attack_name}", - ) - # ------------------------------------------------ - # Save the configs and user_configs - config_log_path = configs["audit"]["config_log"] - os.makedirs(config_log_path, exist_ok=True) - with open(f"{config_log_path}/audit.yaml", "w") as f: - yaml.safe_dump(configs, f) - with open(f"{config_log_path}/user_config.yaml", "w") as f: - yaml.safe_dump(user_configs, f) diff --git a/leakprofl.py b/leakprofl.py deleted file mode 100755 index 0739e5da..00000000 --- a/leakprofl.py +++ /dev/null @@ -1,102 +0,0 @@ -"""Main script to run LEAKPRO on a target model.""" - -import logging -import os -import random -import time -from pathlib import Path - -import numpy as np -import yaml -from torch import manual_seed -from torchvision.models.resnet import BasicBlock - -from dev_utils import shadow_model_blueprints -from dev_utils.cifar10_gia_input_handler import Cifar10GIAInputHandler -from leakpro.attacks.attack_scheduler import AttackScheduler - - -def setup_log(name: str, save_file: bool=True) -> logging.Logger: - """Generate the logger for the current run. - - Args: - ---- - name (str): Logging file name. - save_file (bool): Flag about whether to save to file. - - Returns: - ------- - logging.Logger: Logger object for the current run. - - """ - my_logger = logging.getLogger(name) - my_logger.setLevel(logging.INFO) - log_format = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") - - # Console handler for output to the console - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.INFO) - console_handler.setFormatter(log_format) - my_logger.addHandler(console_handler) - - if save_file: - filename = f"log_{name}.log" - log_handler = logging.FileHandler(filename, mode="w") - log_handler.setLevel(logging.INFO) - log_handler.setFormatter(log_format) - my_logger.addHandler(log_handler) - - return my_logger - -def generate_client_input(configs: dict) -> tuple: - """Generate client data and data splits.""" - # ------------------------------------------------ - - # Create the population dataset and target_model - if "cifar10" in configs["gia_settings"]["dataset"]: - target_model = shadow_model_blueprints.ResNet(BasicBlock, [5, 5, 5], num_classes=10, base_width=16 * 10) - else: - raise KeyError(f"""dataset {configs["gia_settings"]["dataset"]} not implemented.""") - - return target_model - -if __name__ == "__main__": - - # Setup logger - logger = setup_log("LeakPro", save_file=True) - - start_time = time.time() - # ------------------------------------------------ - # LEAKPRO starts here - args = "./config/audit.yaml" # noqa: ERA001 - - with open(args, "rb") as f: - configs = yaml.safe_load(f) - - # Create client loader and model - target_model = generate_client_input(configs["audit"]) - # Set the random seed, log_dir - manual_seed(configs["audit"]["random_seed"]) - np.random.seed(configs["audit"]["random_seed"]) - random.seed(configs["audit"]["random_seed"]) - - # Create directory to store results - report_dir = f"{configs['audit']['report_log']}" - Path(report_dir).mkdir(parents=True, exist_ok=True) - - # Create user input handler - handler = Cifar10GIAInputHandler(configs=configs, logger=logger, target_model=target_model) - - attack_scheduler = AttackScheduler(handler) - audit_results = attack_scheduler.run_attacks() - - for attack_name in audit_results: - logger.info(f"Preparing results for attack: {attack_name}") - - audit_results[attack_name]["result_object"].prepare_privacy_risk_report(attack_name, report_dir) - # ------------------------------------------------ - # Save the configs and user_configs - config_log_path = configs["audit"]["config_log"] - os.makedirs(config_log_path, exist_ok=True) - with open(f"{config_log_path}/audit.yaml", "w") as f: - yaml.safe_dump(configs, f)