Merge pull request #168 from aidotse/report_handler
Report handler
henrikfo authored Dec 11, 2024
2 parents 81b735b + 088156c commit f7a0b1c
Showing 44 changed files with 2,736 additions and 234 deletions.
9 changes: 5 additions & 4 deletions examples/mia/tabular_mia/adult_handler.py
@@ -8,6 +8,7 @@

from leakpro import AbstractInputHandler


class AdultInputHandler(AbstractInputHandler):
"""Class to handle the user input for the CIFAR10 dataset."""

@@ -41,11 +42,11 @@ def train(

criterion = self.get_criterion()
optimizer = self.get_optimizer(model)

for e in tqdm(range(epochs), desc="Training Progress"):
model.train()
train_acc, train_loss = 0.0, 0.0

for data, target in dataloader:
target = target.float().unsqueeze(1)
data, target = data.to(dev, non_blocking=True), target.to(dev, non_blocking=True)
@@ -55,11 +56,11 @@
loss = criterion(output, target)
pred = sigmoid(output) >= 0.5
train_acc += pred.eq(target).sum().item()

loss.backward()
optimizer.step()
train_loss += loss.item()

train_acc = train_acc/len(dataloader.dataset)
train_loss = train_loss/len(dataloader)

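A minimal usage sketch for this handler, mirroring the notebook cell changed later in this PR: the handler class and an audit config are handed to LeakPro and the audit is launched. The LeakPro import path and the config_path value are assumptions, not shown in this diff.

from leakpro import LeakPro  # import path assumed; the diff itself only shows AbstractInputHandler
from examples.mia.tabular_mia.adult_handler import AdultInputHandler

config_path = "audit.yaml"  # placeholder; point this at your audit configuration

# LeakPro drives training and the attacks through the handler's
# train(), get_criterion() and get_optimizer() hooks.
leakpro = LeakPro(AdultInputHandler, config_path)
leakpro.run_audit()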
35 changes: 19 additions & 16 deletions examples/mia/tabular_mia/main.ipynb
@@ -37,10 +37,13 @@
"project_root = os.path.abspath(os.path.join(os.getcwd(), \"../../..\"))\n",
"sys.path.append(project_root)\n",
"\n",
"from examples.mia.tabular_mia.utils.adult_data_preparation import preprocess_adult_dataset, get_adult_dataloaders, download_adult_dataset\n",
"from examples.mia.tabular_mia.utils.adult_data_preparation import (\n",
" download_adult_dataset,\n",
" get_adult_dataloaders,\n",
" preprocess_adult_dataset,\n",
")\n",
"from examples.mia.tabular_mia.utils.adult_model_preparation import AdultNet, create_trained_model_and_metadata\n",
"\n",
"\n",
"# Generate the dataset and dataloaders\n",
"path = os.path.join(os.getcwd(), \"data/\")\n",
"\n",
@@ -55,9 +58,9 @@
"if not os.path.exists(\"target\"):\n",
" os.makedirs(\"target\")\n",
"model = AdultNet(input_size=n_features, hidden_size=64, num_classes=n_classes)\n",
"train_acc, train_loss, test_acc, test_loss = create_trained_model_and_metadata(model, \n",
" train_loader, \n",
" test_loader, \n",
"train_acc, train_loss, test_acc, test_loss = create_trained_model_and_metadata(model,\n",
" train_loader,\n",
" test_loader,\n",
" epochs=10)"
]
},
@@ -84,20 +87,20 @@
"plt.figure(figsize=(5, 4))\n",
"\n",
"plt.subplot(1, 2, 1)\n",
"plt.plot(train_acc, label='Train Accuracy')\n",
"plt.plot(test_acc, label='Test Accuracy')\n",
"plt.xlabel('Epoch')\n",
"plt.ylabel('Accuracy')\n",
"plt.title('Accuracy over Epochs')\n",
"plt.plot(train_acc, label=\"Train Accuracy\")\n",
"plt.plot(test_acc, label=\"Test Accuracy\")\n",
"plt.xlabel(\"Epoch\")\n",
"plt.ylabel(\"Accuracy\")\n",
"plt.title(\"Accuracy over Epochs\")\n",
"plt.legend()\n",
"\n",
"# Plot training and test loss\n",
"plt.subplot(1, 2, 2)\n",
"plt.plot(train_loss, label='Train Loss')\n",
"plt.plot(test_loss, label='Test Loss')\n",
"plt.xlabel('Epoch')\n",
"plt.ylabel('Loss')\n",
"plt.title('Loss over Epochs')\n",
"plt.plot(train_loss, label=\"Train Loss\")\n",
"plt.plot(test_loss, label=\"Test Loss\")\n",
"plt.xlabel(\"Epoch\")\n",
"plt.ylabel(\"Loss\")\n",
"plt.title(\"Loss over Epochs\")\n",
"plt.legend()\n",
"\n",
"plt.tight_layout()\n",
@@ -501,7 +504,7 @@
"# Prepare leakpro object\n",
"leakpro = LeakPro(AdultInputHandler, config_path)\n",
"\n",
"# Run the audit \n",
"# Run the audit\n",
"leakpro.run_audit()"
]
},
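For the plots in this notebook: create_trained_model_and_metadata (see adult_model_preparation.py below) appends one value per epoch, so each curve has epochs points. A tiny illustrative check of that call; the assertion is for illustration only.

train_acc, train_loss, test_acc, test_loss = create_trained_model_and_metadata(
    model, train_loader, test_loader, epochs=10
)
assert len(train_acc) == 10  # one accuracy value per training epoch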
45 changes: 23 additions & 22 deletions examples/mia/tabular_mia/utils/adult_data_preparation.py
@@ -1,25 +1,26 @@
import os
import pickle
import urllib.request

import joblib
import numpy as np
import pandas as pd
import joblib
import pickle
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import urllib.request
from torch.utils.data import Dataset, Subset, DataLoader
from torch import tensor, float32
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from torch import float32, tensor
from torch.utils.data import DataLoader, Dataset, Subset


class AdultDataset(Dataset):
def __init__(self, x:tensor, y:tensor, dec_to_onehot:dict, one_hot_encoded:bool=True):
self.x = x
self.y = y

# create dictionary to map between indices in categorical representation and one-hot encoded representation
# For example: cols 1,2 continuous and col 3 categorical with 3 categories will be mapped to {1:1,2:2,3:[3,4,5]}
self.dec_to_onehot = dec_to_onehot
self.one_hot_encoded = one_hot_encoded

def __len__(self):
return len(self.y)

@@ -28,8 +29,8 @@ def __getitem__(self, idx):

def subset(self, indices):
return AdultDataset(self.x[indices], self.y[indices], self.dec_to_onehot, self.one_hot_encoded)


def download_adult_dataset(data_dir):
"""Download the Adult Dataset if it's not present."""
# URLs for the dataset
@@ -54,22 +55,22 @@ def download_adult_dataset(data_dir):

def preprocess_adult_dataset(path):
"""Get the dataset, download it if necessary, and store it."""

if os.path.exists(os.path.join(path, "adult_data.pkl")):
with open(os.path.join(path, "adult_data.pkl"), "rb") as f:
dataset = joblib.load(f)
else:
else:
column_names = [
"age", "workclass", "fnlwgt", "education", "education-num",
"age", "workclass", "fnlwgt", "education", "education-num",
"marital-status", "occupation", "relationship", "race", "sex",
"capital-gain", "capital-loss", "hours-per-week", "native-country", "income",
]

# Load and clean data
df_train = pd.read_csv(os.path.join(path, "adult.data"), names=column_names)
df_test = pd.read_csv(os.path.join(path, "adult.test"), names=column_names, header=0)
df_test["income"] = df_test["income"].str.replace(".", "", regex=False)

df_concatenated = pd.concat([df_train, df_test], axis=0)
df_clean = df_concatenated.replace(" ?", np.nan).dropna()

@@ -83,19 +84,19 @@ def preprocess_adult_dataset(path):
# Scaling numerical features
scaler = StandardScaler()
x_numerical = pd.DataFrame(scaler.fit_transform(x[numerical_features]), columns=numerical_features, index=x.index)

# Label encode the categories
one_hot_encoder = OneHotEncoder(sparse_output=False)
x_categorical_one_hot = one_hot_encoder.fit_transform(x[categorical_features])
one_hot_feature_names = one_hot_encoder.get_feature_names_out(categorical_features)
x_categorical_one_hot_df = pd.DataFrame(x_categorical_one_hot, columns=one_hot_feature_names, index=x.index)

# Concatenate the numerical and one-hot encoded categorical features
x_final = pd.concat([x_numerical, x_categorical_one_hot_df], axis=1)

# Label encode the target variable
y = pd.Series(LabelEncoder().fit_transform(y))

# Add numerical features to the dictionary
dec_to_onehot_mapping = {}
for i, feature in enumerate(numerical_features):
@@ -115,22 +116,22 @@ def preprocess_adult_dataset(path):
with open(f"{path}/adult_data.pkl", "wb") as file:
pickle.dump(dataset, file)
print(f"Save data to {path}.pkl")

return dataset

def get_adult_dataloaders(dataset, train_fraction=0.3, test_fraction=0.3):

dataset_size = len(dataset)
train_size = int(train_fraction * dataset_size)
test_size = int(test_fraction * dataset_size)

# Use sklearn's train_test_split to split into train and test indices
selected_index = np.random.choice(np.arange(dataset_size), train_size + test_size, replace=False)
train_indices, test_indices = train_test_split(selected_index, test_size=test_size)

train_subset = Subset(dataset, train_indices)
test_subset = Subset(dataset, test_indices)

train_loader = DataLoader(train_subset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_subset, batch_size=128, shuffle=False)

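An illustrative sketch of the dec_to_onehot mapping that AdultDataset carries (see the comment in the hunk above): a numerical column maps to its own index, while a categorical column maps to the list of one-hot columns it expands into. The column names and category count below are made up for the example; only the mapping shape follows the code.

# Two continuous columns and one categorical column with two categories (illustrative names).
numerical_features = ["age", "hours-per-week"]
categorical_features = {"sex": 2}

dec_to_onehot = {}
for i, feature in enumerate(numerical_features):
    dec_to_onehot[i] = i  # continuous: column index maps to itself

n_num = len(numerical_features)
offset = n_num
for j, (feature, n_categories) in enumerate(categorical_features.items()):
    # categorical: column index maps to the range of one-hot columns it occupies
    dec_to_onehot[n_num + j] = list(range(offset, offset + n_categories))
    offset += n_categories

print(dec_to_onehot)  # {0: 0, 1: 1, 2: [2, 3]}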
27 changes: 14 additions & 13 deletions examples/mia/tabular_mia/utils/adult_model_preparation.py
@@ -1,8 +1,9 @@
import torch.nn as nn
from torch import device, optim, cuda, no_grad, save, sigmoid
import pickle

from torch import cuda, device, nn, no_grad, optim, save, sigmoid
from tqdm import tqdm


class AdultNet(nn.Module):
def __init__(self, input_size, hidden_size, num_classes):
super(AdultNet, self).__init__()
@@ -13,7 +14,7 @@ def __init__(self, input_size, hidden_size, num_classes):
self.relu = nn.ReLU()
self.fc2 = nn.Linear(hidden_size, hidden_size)
self.fc3 = nn.Linear(hidden_size, num_classes)

def forward(self, x):
out = self.fc1(x)
out = self.relu(out)
@@ -47,11 +48,11 @@ def create_trained_model_and_metadata(model, train_loader, test_loader, epochs =
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.8)
train_losses, train_accuracies = [], []
test_losses, test_accuracies = [], []

for e in tqdm(range(epochs), desc="Training Progress"):
model.train()
train_acc, train_loss = 0.0, 0.0

for data, target in train_loader:
target = target.float().unsqueeze(1)
data, target = data.to(device_name, non_blocking=True), target.to(device_name, non_blocking=True)
@@ -61,17 +62,17 @@ def create_trained_model_and_metadata(model, train_loader, test_loader, epochs =
loss = criterion(output, target)
pred = sigmoid(output) >= 0.5
train_acc += pred.eq(target).sum().item()

loss.backward()
optimizer.step()
train_loss += loss.item()

train_loss /= len(train_loader)
train_acc /= len(train_loader.dataset)

train_losses.append(train_loss)
train_accuracies.append(train_acc)

test_loss, test_acc = evaluate(model, test_loader, criterion, device_name)
test_losses.append(test_loss)
test_accuracies.append(test_acc)
@@ -86,12 +87,12 @@ def create_trained_model_and_metadata(model, train_loader, test_loader, epochs =
meta_data["train_indices"] = train_loader.dataset.indices
meta_data["test_indices"] = test_loader.dataset.indices
meta_data["num_train"] = len(meta_data["train_indices"])

# Write init params
meta_data["init_params"] = {}
for key, value in model.init_params.items():
meta_data["init_params"][key] = value

# read out optimizer parameters
meta_data["optimizer"] = {}
meta_data["optimizer"]["name"] = optimizer.__class__.__name__.lower()
Expand All @@ -112,8 +113,8 @@ def create_trained_model_and_metadata(model, train_loader, test_loader, epochs =
meta_data["train_loss"] = train_loss
meta_data["test_loss"] = test_loss
meta_data["dataset"] = "adult"

with open("target/model_metadata.pkl", "wb") as f:
pickle.dump(meta_data, f)

return train_accuracies, train_losses, test_accuracies, test_losses
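create_trained_model_and_metadata above calls an evaluate() helper that is not shown in this diff. A plausible sketch of such a helper, mirroring the training-loop metrics (binary target, sigmoid threshold at 0.5); this is an assumption about its shape, not the repository's actual implementation.

from torch import no_grad, sigmoid

def evaluate(model, loader, criterion, device_name):
    # Sketch only: mean loss per batch, accuracy as the fraction of correct samples.
    model.eval()
    loss, acc = 0.0, 0.0
    with no_grad():
        for data, target in loader:
            target = target.float().unsqueeze(1)
            data, target = data.to(device_name), target.to(device_name)
            output = model(data)
            loss += criterion(output, target).item()
            pred = sigmoid(output) >= 0.5
            acc += pred.eq(target).sum().item()
    loss /= len(loader)
    acc /= len(loader.dataset)
    return loss, acc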
51 changes: 51 additions & 0 deletions examples/report_handler/mia_utils/audit.yaml
@@ -0,0 +1,51 @@
audit: # Configurations for auditing
random_seed: 1234 # Integer specifying the random seed
attack_list:
rmia:
training_data_fraction: 0.5 # Fraction of the auxiliary dataset to use for this attack (in each shadow model training)
attack_data_fraction: 0.5 # Fraction of auxiliary dataset to sample from during attack
num_shadow_models: 3 # Number of shadow models to train
online: True # perform online or offline attack
temperature: 2
gamma: 2.0
offline_a: 0.33 # parameters a and b used to compute p_IN(x) from p_OUT(x) as p_IN(x) = a * p_OUT(x) + b
offline_b: 0.66
population:
attack_data_fraction: 1.0 # Fraction of the auxiliary dataset to use for this attack
lira:
training_data_fraction: 0.5 # Fraction of the auxiliary dataset to use for this attack (in each shadow model training)
num_shadow_models: 3 # Number of shadow models to train
online: True # perform online or offline attack
loss_traj:
training_distill_data_fraction: 0.7 # Fraction of the auxiliary dataset to use for training the distillation models D_s = (1-D_KD)/2
number_of_traj: 10 # Number of epochs (number of points in the loss trajectory)
label_only: False # True or False
mia_classifier_epochs: 100
HSJ:
attack_data_fraction: 0.01 # Fraction of the auxiliary dataset to use for this attack
target_metadata_path: "./target/model_metadata.pkl"
num_iterations: 2 # Number of iterations for the optimization
initial_num_evals: 100 # Initial number of random vectors used to estimate the gradient
max_num_evals: 10000 # Maximum number of evaluations
stepsize_search: "geometric_progression" # Step size search method
gamma: 1.0 # Gamma for the optimization
constraint: 2
batch_size: 50
verbose: True
epsilon_threshold: 1e-6

output_dir: "./leakpro_output"
attack_type: "mia" #mia, gia
modality: "image" #image, tabular

target:
# Target model path
module_path: "./mia_utils/utils/cifar_model_preparation.py"
model_class: "ResNet18"
# Data paths
target_folder: "./target"
data_path: "./data/cifar10.pkl"

shadow_model:

distillation_model:
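The offline_a/offline_b comment above describes the RMIA relation p_IN(x) = a * p_OUT(x) + b. A small sketch of reading this config and applying the relation to one example probability; loading via PyYAML and the sample p_OUT value are illustrative assumptions, not part of the PR.

import yaml

with open("audit.yaml") as f:  # path assumed relative to the example folder
    cfg = yaml.safe_load(f)

rmia = cfg["audit"]["attack_list"]["rmia"]
a, b = rmia["offline_a"], rmia["offline_b"]  # 0.33 and 0.66 in this file

p_out = 0.5            # example OUT-probability
p_in = a * p_out + b   # 0.33 * 0.5 + 0.66 = 0.825
print(f"p_IN(x) = {p_in:.3f}")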
