Merge pull request #168 from aidotse/report_handler
Report handler
henrikfo authored Dec 11, 2024
2 parents 81b735b + 088156c commit f7a0b1c
Showing 44 changed files with 2,736 additions and 234 deletions.
9 changes: 5 additions & 4 deletions examples/mia/tabular_mia/adult_handler.py
@@ -8,6 +8,7 @@

from leakpro import AbstractInputHandler


class AdultInputHandler(AbstractInputHandler):
"""Class to handle the user input for the CIFAR10 dataset."""

@@ -41,11 +42,11 @@ def train(

criterion = self.get_criterion()
optimizer = self.get_optimizer(model)

for e in tqdm(range(epochs), desc="Training Progress"):
model.train()
train_acc, train_loss = 0.0, 0.0

for data, target in dataloader:
target = target.float().unsqueeze(1)
data, target = data.to(dev, non_blocking=True), target.to(dev, non_blocking=True)
@@ -55,11 +56,11 @@
loss = criterion(output, target)
pred = sigmoid(output) >= 0.5
train_acc += pred.eq(target).sum().item()

loss.backward()
optimizer.step()
train_loss += loss.item()

train_acc = train_acc/len(dataloader.dataset)
train_loss = train_loss/len(dataloader)

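A minimal usage sketch for this handler, mirroring the notebook cell changed later in this PR: the handler class and an audit config are handed to LeakPro and the audit is launched. The LeakPro import path and the config_path value are assumptions, not shown in this diff.

from leakpro import LeakPro  # import path assumed; the diff itself only shows AbstractInputHandler
from examples.mia.tabular_mia.adult_handler import AdultInputHandler

config_path = "audit.yaml"  # placeholder; point this at your audit configuration

# LeakPro drives training and the attacks through the handler's
# train(), get_criterion() and get_optimizer() hooks.
leakpro = LeakPro(AdultInputHandler, config_path)
leakpro.run_audit()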
35 changes: 19 additions & 16 deletions examples/mia/tabular_mia/main.ipynb
@@ -37,10 +37,13 @@
"project_root = os.path.abspath(os.path.join(os.getcwd(), \"../../..\"))\n",
"sys.path.append(project_root)\n",
"\n",
"from examples.mia.tabular_mia.utils.adult_data_preparation import preprocess_adult_dataset, get_adult_dataloaders, download_adult_dataset\n",
"from examples.mia.tabular_mia.utils.adult_data_preparation import (\n",
" download_adult_dataset,\n",
" get_adult_dataloaders,\n",
" preprocess_adult_dataset,\n",
")\n",
"from examples.mia.tabular_mia.utils.adult_model_preparation import AdultNet, create_trained_model_and_metadata\n",
"\n",
"\n",
"# Generate the dataset and dataloaders\n",
"path = os.path.join(os.getcwd(), \"data/\")\n",
"\n",
@@ -55,9 +58,9 @@
"if not os.path.exists(\"target\"):\n",
" os.makedirs(\"target\")\n",
"model = AdultNet(input_size=n_features, hidden_size=64, num_classes=n_classes)\n",
"train_acc, train_loss, test_acc, test_loss = create_trained_model_and_metadata(model, \n",
" train_loader, \n",
" test_loader, \n",
"train_acc, train_loss, test_acc, test_loss = create_trained_model_and_metadata(model,\n",
" train_loader,\n",
" test_loader,\n",
" epochs=10)"
]
},
@@ -84,20 +87,20 @@
"plt.figure(figsize=(5, 4))\n",
"\n",
"plt.subplot(1, 2, 1)\n",
"plt.plot(train_acc, label='Train Accuracy')\n",
"plt.plot(test_acc, label='Test Accuracy')\n",
"plt.xlabel('Epoch')\n",
"plt.ylabel('Accuracy')\n",
"plt.title('Accuracy over Epochs')\n",
"plt.plot(train_acc, label=\"Train Accuracy\")\n",
"plt.plot(test_acc, label=\"Test Accuracy\")\n",
"plt.xlabel(\"Epoch\")\n",
"plt.ylabel(\"Accuracy\")\n",
"plt.title(\"Accuracy over Epochs\")\n",
"plt.legend()\n",
"\n",
"# Plot training and test loss\n",
"plt.subplot(1, 2, 2)\n",
"plt.plot(train_loss, label='Train Loss')\n",
"plt.plot(test_loss, label='Test Loss')\n",
"plt.xlabel('Epoch')\n",
"plt.ylabel('Loss')\n",
"plt.title('Loss over Epochs')\n",
"plt.plot(train_loss, label=\"Train Loss\")\n",
"plt.plot(test_loss, label=\"Test Loss\")\n",
"plt.xlabel(\"Epoch\")\n",
"plt.ylabel(\"Loss\")\n",
"plt.title(\"Loss over Epochs\")\n",
"plt.legend()\n",
"\n",
"plt.tight_layout()\n",
@@ -501,7 +504,7 @@
"# Prepare leakpro object\n",
"leakpro = LeakPro(AdultInputHandler, config_path)\n",
"\n",
"# Run the audit \n",
"# Run the audit\n",
"leakpro.run_audit()"
]
},
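For the plots in this notebook: create_trained_model_and_metadata (see adult_model_preparation.py below) appends one value per epoch, so each curve has epochs points. A tiny illustrative check of that call; the assertion is for illustration only.

train_acc, train_loss, test_acc, test_loss = create_trained_model_and_metadata(
    model, train_loader, test_loader, epochs=10
)
assert len(train_acc) == 10  # one accuracy value per training epoch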
45 changes: 23 additions & 22 deletions examples/mia/tabular_mia/utils/adult_data_preparation.py
@@ -1,25 +1,26 @@
import os
import pickle
import urllib.request

import joblib
import numpy as np
import pandas as pd
import joblib
import pickle
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import urllib.request
from torch.utils.data import Dataset, Subset, DataLoader
from torch import tensor, float32
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from torch import float32, tensor
from torch.utils.data import DataLoader, Dataset, Subset


class AdultDataset(Dataset):
def __init__(self, x:tensor, y:tensor, dec_to_onehot:dict, one_hot_encoded:bool=True):
self.x = x
self.y = y

# create dictionary to map between indices in categorical representation and one-hot encoded representation
# For example: cols 1,2 continuous and col 3 categorical with 3 categories will be mapped to {1:1,2:2,3:[3,4,5]}
self.dec_to_onehot = dec_to_onehot
self.one_hot_encoded = one_hot_encoded

def __len__(self):
return len(self.y)

@@ -28,8 +29,8 @@ def __getitem__(self, idx):

def subset(self, indices):
return AdultDataset(self.x[indices], self.y[indices], self.dec_to_onehot, self.one_hot_encoded)


def download_adult_dataset(data_dir):
"""Download the Adult Dataset if it's not present."""
# URLs for the dataset
@@ -54,22 +55,22 @@ def download_adult_dataset(data_dir):

def preprocess_adult_dataset(path):
"""Get the dataset, download it if necessary, and store it."""

if os.path.exists(os.path.join(path, "adult_data.pkl")):
with open(os.path.join(path, "adult_data.pkl"), "rb") as f:
dataset = joblib.load(f)
else:
else:
column_names = [
"age", "workclass", "fnlwgt", "education", "education-num",
"age", "workclass", "fnlwgt", "education", "education-num",
"marital-status", "occupation", "relationship", "race", "sex",
"capital-gain", "capital-loss", "hours-per-week", "native-country", "income",
]

# Load and clean data
df_train = pd.read_csv(os.path.join(path, "adult.data"), names=column_names)
df_test = pd.read_csv(os.path.join(path, "adult.test"), names=column_names, header=0)
df_test["income"] = df_test["income"].str.replace(".", "", regex=False)

df_concatenated = pd.concat([df_train, df_test], axis=0)
df_clean = df_concatenated.replace(" ?", np.nan).dropna()

@@ -83,19 +84,19 @@ def preprocess_adult_dataset(path):
# Scaling numerical features
scaler = StandardScaler()
x_numerical = pd.DataFrame(scaler.fit_transform(x[numerical_features]), columns=numerical_features, index=x.index)

# Label encode the categories
one_hot_encoder = OneHotEncoder(sparse_output=False)
x_categorical_one_hot = one_hot_encoder.fit_transform(x[categorical_features])
one_hot_feature_names = one_hot_encoder.get_feature_names_out(categorical_features)
x_categorical_one_hot_df = pd.DataFrame(x_categorical_one_hot, columns=one_hot_feature_names, index=x.index)

# Concatenate the numerical and one-hot encoded categorical features
x_final = pd.concat([x_numerical, x_categorical_one_hot_df], axis=1)

# Label encode the target variable
y = pd.Series(LabelEncoder().fit_transform(y))

# Add numerical features to the dictionary
dec_to_onehot_mapping = {}
for i, feature in enumerate(numerical_features):
@@ -115,22 +116,22 @@ def preprocess_adult_dataset(path):
with open(f"{path}/adult_data.pkl", "wb") as file:
pickle.dump(dataset, file)
print(f"Save data to {path}.pkl")

return dataset

def get_adult_dataloaders(dataset, train_fraction=0.3, test_fraction=0.3):

dataset_size = len(dataset)
train_size = int(train_fraction * dataset_size)
test_size = int(test_fraction * dataset_size)

# Use sklearn's train_test_split to split into train and test indices
selected_index = np.random.choice(np.arange(dataset_size), train_size + test_size, replace=False)
train_indices, test_indices = train_test_split(selected_index, test_size=test_size)

train_subset = Subset(dataset, train_indices)
test_subset = Subset(dataset, test_indices)

train_loader = DataLoader(train_subset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_subset, batch_size=128, shuffle=False)

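An illustrative sketch of the dec_to_onehot mapping that AdultDataset carries (see the comment in the hunk above): a numerical column maps to its own index, while a categorical column maps to the list of one-hot columns it expands into. The column names and category count below are made up for the example; only the mapping shape follows the code.

# Two continuous columns and one categorical column with two categories (illustrative names).
numerical_features = ["age", "hours-per-week"]
categorical_features = {"sex": 2}

dec_to_onehot = {}
for i, feature in enumerate(numerical_features):
    dec_to_onehot[i] = i  # continuous: column index maps to itself

n_num = len(numerical_features)
offset = n_num
for j, (feature, n_categories) in enumerate(categorical_features.items()):
    # categorical: column index maps to the range of one-hot columns it occupies
    dec_to_onehot[n_num + j] = list(range(offset, offset + n_categories))
    offset += n_categories

print(dec_to_onehot)  # {0: 0, 1: 1, 2: [2, 3]}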
27 changes: 14 additions & 13 deletions examples/mia/tabular_mia/utils/adult_model_preparation.py
@@ -1,8 +1,9 @@
import torch.nn as nn
from torch import device, optim, cuda, no_grad, save, sigmoid
import pickle

from torch import cuda, device, nn, no_grad, optim, save, sigmoid
from tqdm import tqdm


class AdultNet(nn.Module):
def __init__(self, input_size, hidden_size, num_classes):
super(AdultNet, self).__init__()
@@ -13,7 +14,7 @@ def __init__(self, input_size, hidden_size, num_classes):
self.relu = nn.ReLU()
self.fc2 = nn.Linear(hidden_size, hidden_size)
self.fc3 = nn.Linear(hidden_size, num_classes)

def forward(self, x):
out = self.fc1(x)
out = self.relu(out)
@@ -47,11 +48,11 @@ def create_trained_model_and_metadata(model, train_loader, test_loader, epochs =
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.8)
train_losses, train_accuracies = [], []
test_losses, test_accuracies = [], []

for e in tqdm(range(epochs), desc="Training Progress"):
model.train()
train_acc, train_loss = 0.0, 0.0

for data, target in train_loader:
target = target.float().unsqueeze(1)
data, target = data.to(device_name, non_blocking=True), target.to(device_name, non_blocking=True)
@@ -61,17 +62,17 @@ def create_trained_model_and_metadata(model, train_loader, test_loader, epochs =
loss = criterion(output, target)
pred = sigmoid(output) >= 0.5
train_acc += pred.eq(target).sum().item()

loss.backward()
optimizer.step()
train_loss += loss.item()

train_loss /= len(train_loader)
train_acc /= len(train_loader.dataset)

train_losses.append(train_loss)
train_accuracies.append(train_acc)

test_loss, test_acc = evaluate(model, test_loader, criterion, device_name)
test_losses.append(test_loss)
test_accuracies.append(test_acc)
@@ -86,12 +87,12 @@ def create_trained_model_and_metadata(model, train_loader, test_loader, epochs =
meta_data["train_indices"] = train_loader.dataset.indices
meta_data["test_indices"] = test_loader.dataset.indices
meta_data["num_train"] = len(meta_data["train_indices"])

# Write init params
meta_data["init_params"] = {}
for key, value in model.init_params.items():
meta_data["init_params"][key] = value

# read out optimizer parameters
meta_data["optimizer"] = {}
meta_data["optimizer"]["name"] = optimizer.__class__.__name__.lower()
Expand All @@ -112,8 +113,8 @@ def create_trained_model_and_metadata(model, train_loader, test_loader, epochs =
meta_data["train_loss"] = train_loss
meta_data["test_loss"] = test_loss
meta_data["dataset"] = "adult"

with open("target/model_metadata.pkl", "wb") as f:
pickle.dump(meta_data, f)

return train_accuracies, train_losses, test_accuracies, test_losses
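create_trained_model_and_metadata above calls an evaluate() helper that is not shown in this diff. A plausible sketch of such a helper, mirroring the training-loop metrics (binary target, sigmoid threshold at 0.5); this is an assumption about its shape, not the repository's actual implementation.

from torch import no_grad, sigmoid

def evaluate(model, loader, criterion, device_name):
    # Sketch only: mean loss per batch, accuracy as the fraction of correct samples.
    model.eval()
    loss, acc = 0.0, 0.0
    with no_grad():
        for data, target in loader:
            target = target.float().unsqueeze(1)
            data, target = data.to(device_name), target.to(device_name)
            output = model(data)
            loss += criterion(output, target).item()
            pred = sigmoid(output) >= 0.5
            acc += pred.eq(target).sum().item()
    loss /= len(loader)
    acc /= len(loader.dataset)
    return loss, acc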
51 changes: 51 additions & 0 deletions examples/report_handler/mia_utils/audit.yaml
@@ -0,0 +1,51 @@
audit: # Configurations for auditing
random_seed: 1234 # Integer specifying the random seed
attack_list:
rmia:
training_data_fraction: 0.5 # Fraction of the auxiliary dataset to use for this attack (in each shadow model training)
attack_data_fraction: 0.5 # Fraction of auxiliary dataset to sample from during attack
num_shadow_models: 3 # Number of shadow models to train
online: True # perform online or offline attack
temperature: 2
gamma: 2.0
offline_a: 0.33 # parameters a and b used to compute p_IN(x) from p_OUT(x) as p_IN(x) = a * p_OUT(x) + b
offline_b: 0.66
population:
attack_data_fraction: 1.0 # Fraction of the auxiliary dataset to use for this attack
lira:
training_data_fraction: 0.5 # Fraction of the auxiliary dataset to use for this attack (in each shadow model training)
num_shadow_models: 3 # Number of shadow models to train
online: True # perform online or offline attack
loss_traj:
training_distill_data_fraction: 0.7 # Fraction of the auxiliary dataset to use for training the distillation models D_s = (1-D_KD)/2
number_of_traj: 10 # Number of epochs (number of points in the loss trajectory)
label_only: False # True or False
mia_classifier_epochs: 100
HSJ:
attack_data_fraction: 0.01 # Fraction of the auxiliary dataset to use for this attack
target_metadata_path: "./target/model_metadata.pkl"
num_iterations: 2 # Number of iterations for the optimization
initial_num_evals: 100 # Initial number of random vectors used to estimate the gradient
max_num_evals: 10000 # Maximum number of evaluations
stepsize_search: "geometric_progression" # Step size search method
gamma: 1.0 # Gamma for the optimization
constraint: 2
batch_size: 50
verbose: True
epsilon_threshold: 1e-6

output_dir: "./leakpro_output"
attack_type: "mia" #mia, gia
modality: "image" #image, tabular

target:
# Target model path
module_path: "./mia_utils/utils/cifar_model_preparation.py"
model_class: "ResNet18"
# Data paths
target_folder: "./target"
data_path: "./data/cifar10.pkl"

shadow_model:

distillation_model:
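The offline_a/offline_b comment above describes the RMIA relation p_IN(x) = a * p_OUT(x) + b. A small sketch of reading this config and applying the relation to one example probability; loading via PyYAML and the sample p_OUT value are illustrative assumptions, not part of the PR.

import yaml

with open("audit.yaml") as f:  # path assumed relative to the example folder
    cfg = yaml.safe_load(f)

rmia = cfg["audit"]["attack_list"]["rmia"]
a, b = rmia["offline_a"], rmia["offline_b"]  # 0.33 and 0.66 in this file

p_out = 0.5            # example OUT-probability
p_in = a * p_out + b   # 0.33 * 0.5 + 0.66 = 0.825
print(f"p_IN(x) = {p_in:.3f}")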
