first step towards tabular fl attack
johanos1 committed Nov 27, 2024
1 parent 9f4d953 commit 8e8f05f
Showing 10 changed files with 468 additions and 34 deletions.
48 changes: 48 additions & 0 deletions examples/gia/adult/adultGIAHandler.py
@@ -0,0 +1,48 @@
from leakpro.input_handler.abstract_gia_input_handler import AbstractGIAInputHandler
from leakpro.fl_utils.gia_optimizers import MetaOptimizer
from leakpro.fl_utils.gia_module_to_functional import MetaModule

from torch.nn import Module, BCEWithLogitsLoss
from torch import cuda, device, optim
from torch.utils.data import DataLoader


class adultGiaHandler(AbstractGIAInputHandler):

    def __init__(self, configs: dict, dataloader: DataLoader) -> None:
        super().__init__(configs)
        self.dataloader = dataloader

    def get_client_dataloader(self) -> DataLoader:
        return self.dataloader

    def get_criterion(self) -> Module:
        """Return the BCEWithLogitsLoss criterion for binary classification."""
        return BCEWithLogitsLoss()

    def get_optimizer(self, model: Module) -> optim.Optimizer:
        """Return the SGD optimizer for the model."""
        learning_rate = 0.1
        momentum = 0.8
        return optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

    @staticmethod
    def train(
        dataloader: DataLoader,
        model: Module = None,
        criterion: Module = None,
        optimizer: MetaOptimizer = None,
        epochs: int = None,
    ) -> MetaModule:
        """Run the client training loop and return the functionally patched model."""
        dev = device("cuda" if cuda.is_available() else "cpu")
        model.to(dev)
        # MetaModule holds the parameters functionally, so every optimizer step
        # stays differentiable -- the property the gradient inversion attack needs.
        patched_model = MetaModule(model)

        for _ in range(epochs):
            for data, target in dataloader:
                target = target.float().unsqueeze(1)  # BCEWithLogitsLoss expects (N, 1) float targets
                data, target = data.to(dev, non_blocking=True), target.to(dev, non_blocking=True)
                output = patched_model(data, patched_model.parameters)
                loss = criterion(output, target)
                patched_model.parameters = optimizer.step(loss, patched_model.parameters)
        return patched_model
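Note that the train loop above never calls loss.backward(); instead MetaOptimizer.step returns a fresh parameter set, keeping every update differentiable. A minimal sketch of such a functional step, assuming plain SGD (momentum omitted for brevity) over a name-to-tensor parameter dict -- an illustration for intuition, not leakpro's actual MetaOptimizer:

# Hypothetical differentiable SGD step in the spirit of MetaOptimizer.
from collections import OrderedDict
import torch

class FunctionalSGD:
    def __init__(self, lr: float = 0.1):
        self.lr = lr

    def step(self, loss: torch.Tensor, parameters: OrderedDict) -> OrderedDict:
        # create_graph=True keeps the update differentiable, so an attack can
        # later backpropagate through the whole client training procedure.
        grads = torch.autograd.grad(loss, tuple(parameters.values()), create_graph=True)
        return OrderedDict(
            (name, p - self.lr * g)
            for (name, p), g in zip(parameters.items(), grads)
        )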
18 changes: 18 additions & 0 deletions examples/gia/adult/audit.yaml
@@ -0,0 +1,18 @@
audit: # Configurations for auditing
  random_seed: 1234 # Integer specifying the random seed
  attack_list:
    tableak:
      pool_size: 10

  output_dir: "./leakpro_output"
  attack_type: "gia" # mia, gia
  modality: "tabular"

target:
  # Target model path
  module_path: "utils/adult_model_preparation.py"
  model_class: "AdultNet"
  # Data paths
  target_folder: "./target"
  data_path: "./data/adult_data.pkl"
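A quick way to sanity-check this config before wiring it into LeakPro, using plain PyYAML (the key layout is taken from the file above; nothing here is leakpro-specific):

# Minimal sketch: load audit.yaml and check the keys the example relies on.
import yaml

with open("audit.yaml") as f:
    cfg = yaml.safe_load(f)

assert cfg["audit"]["attack_type"] == "gia"
assert "tableak" in cfg["audit"]["attack_list"]
print(cfg["target"]["module_path"])  # -> utils/adult_model_preparation.py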
36 changes: 36 additions & 0 deletions examples/gia/adult/main.py
@@ -0,0 +1,36 @@
"""Script demonstrates how to perform gradient inversion attack on adult."""

from torch.utils.data import Subset, DataLoader
from utils.adult_data_preparation import download_adult_dataset, get_adult_dataloaders, preprocess_adult_dataset
from utils.adult_model_preparation import AdultNet, create_trained_model_and_metadata
from adultGIAHandler import adultGiaHandler
import numpy as np
from leakpro import LeakPro

def train_global_model(dataset):
n_features = dataset.x.shape[1]
n_classes = 1
train_loader, test_loader, train_indices, _ = get_adult_dataloaders(dataset, train_fraction=0.3, test_fraction=0.3)

global_model = AdultNet(input_size=n_features, hidden_size=64, num_classes=n_classes)
global_model, meta_data = create_trained_model_and_metadata(global_model, train_loader,test_loader, epochs=10)
return global_model, meta_data

if __name__ == "__main__":

path = "./data"
download_adult_dataset(path)
dataset = preprocess_adult_dataset(path)
global_model, meta_data = train_global_model(dataset)

# Train the client model
n_client_data = 8
client_data_indices = np.random.choice(len(dataset), 8, replace=False)
client_loader = Subset(dataset, client_data_indices)
train_loader = DataLoader(client_loader, batch_size=8, shuffle=True)

# Set up Leakpro
config_path = "audit.yaml"

# Prepare leakpro object
leakpro = LeakPro(adultGiaHandler, config_path)
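The script stops after constructing the LeakPro object; the attack itself is not yet invoked in this first-step commit. For intuition, a self-contained sketch of the gradient-matching idea behind a GIA, assuming the attacker has observed the client's gradients and knows the labels (illustrative names only, not leakpro's API):

# Hypothetical gradient-inversion sketch, not leakpro's implementation.
import torch

def gradient_match_loss(observed, simulated):
    # 1 - cosine similarity between the flattened gradient vectors.
    o = torch.cat([g.flatten() for g in observed])
    s = torch.cat([g.flatten() for g in simulated])
    return 1 - torch.nn.functional.cosine_similarity(o, s, dim=0)

def invert_gradients(model, criterion, observed_grads, x_shape, y, steps=500, lr=0.1):
    x_hat = torch.randn(x_shape, requires_grad=True)  # candidate reconstruction
    opt = torch.optim.Adam([x_hat], lr=lr)
    for _ in range(steps):
        opt.zero_grad()
        loss = criterion(model(x_hat), y)
        # Simulate the client's gradients for the current candidate input.
        grads = torch.autograd.grad(loss, tuple(model.parameters()), create_graph=True)
        rec_loss = gradient_match_loss(observed_grads, grads)
        rec_loss.backward()
        opt.step()
    return x_hat.detach()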
137 changes: 137 additions & 0 deletions examples/gia/adult/utils/adult_data_preparation.py
@@ -0,0 +1,137 @@
import os
import pickle
import urllib.request

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from torch import float32, tensor
from torch.utils.data import DataLoader, Dataset, Subset


class AdultDataset(Dataset):
    def __init__(self, x: tensor, y: tensor, dec_to_onehot: dict, one_hot_encoded: bool = True):
        self.x = x
        self.y = y

        # Dictionary mapping each feature index in the categorical (decimal) representation
        # to the list of column indices it occupies in the one-hot encoded representation.
        # For example: columns 0,1 continuous and column 2 categorical with 3 categories
        # gives {0: [0], 1: [1], 2: [2, 3, 4]}.
        self.dec_to_onehot = dec_to_onehot
        self.one_hot_encoded = one_hot_encoded

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

    def subset(self, indices):
        return AdultDataset(self.x[indices], self.y[indices], self.dec_to_onehot, self.one_hot_encoded)


def download_adult_dataset(data_dir):
    """Download the Adult dataset if it is not already present."""
    # URLs for the dataset
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/"
    data_file = os.path.join(data_dir, "adult.data")
    test_file = os.path.join(data_dir, "adult.test")

    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
        print("Created directory:", data_dir)
    else:
        print("Directory already exists:", data_dir)

    # Download the dataset files if not present
    if not os.path.exists(data_file):
        print("Downloading adult.data...")
        urllib.request.urlretrieve(base_url + "adult.data", data_file)

    if not os.path.exists(test_file):
        print("Downloading adult.test...")
        urllib.request.urlretrieve(base_url + "adult.test", test_file)


def preprocess_adult_dataset(path):
    """Load the preprocessed dataset if it exists, otherwise build it from the raw files and store it."""
    pickle_path = os.path.join(path, "adult_data.pkl")
    if os.path.exists(pickle_path):
        with open(pickle_path, "rb") as f:
            dataset = pickle.load(f)
    else:
        column_names = [
            "age", "workclass", "fnlwgt", "education", "education-num",
            "marital-status", "occupation", "relationship", "race", "sex",
            "capital-gain", "capital-loss", "hours-per-week", "native-country", "income",
        ]

        # Load and clean data
        df_train = pd.read_csv(os.path.join(path, "adult.data"), names=column_names)
        df_test = pd.read_csv(os.path.join(path, "adult.test"), names=column_names, header=0)
        df_test["income"] = df_test["income"].str.replace(".", "", regex=False)

        df_concatenated = pd.concat([df_train, df_test], axis=0)
        df_clean = df_concatenated.replace(" ?", np.nan).dropna()

        # Split features and labels
        x, y = df_clean.iloc[:, :-1], df_clean.iloc[:, -1]

        # Categorical and numerical columns
        categorical_features = [col for col in x.columns if x[col].dtype == "object"]
        numerical_features = [col for col in x.columns if x[col].dtype in ["int64", "float64"]]

        # Scale the numerical features
        scaler = StandardScaler()
        x_numerical = pd.DataFrame(scaler.fit_transform(x[numerical_features]), columns=numerical_features, index=x.index)

        # One-hot encode the categorical features
        one_hot_encoder = OneHotEncoder(sparse_output=False)
        x_categorical_one_hot = one_hot_encoder.fit_transform(x[categorical_features])
        one_hot_feature_names = one_hot_encoder.get_feature_names_out(categorical_features)
        x_categorical_one_hot_df = pd.DataFrame(x_categorical_one_hot, columns=one_hot_feature_names, index=x.index)

        # Concatenate the numerical and one-hot encoded categorical features
        x_final = pd.concat([x_numerical, x_categorical_one_hot_df], axis=1)

        # Label encode the target variable
        y = pd.Series(LabelEncoder().fit_transform(y))

        # Add numerical features to the mapping
        dec_to_onehot_mapping = {}
        for i, feature in enumerate(numerical_features):
            dec_to_onehot_mapping[i] = [x_final.columns.get_loc(feature)]  # each maps to a single column

        # Add one-hot encoded features to the mapping
        for i, categorical_feature in enumerate(categorical_features):
            j = i + len(numerical_features)
            one_hot_columns = [col for col in one_hot_feature_names if col.startswith(categorical_feature)]
            dec_to_onehot_mapping[j] = [x_final.columns.get_loc(col) for col in one_hot_columns]

        # --------------------
        # Create the tensor dataset and store it
        x_tensor = tensor(x_final.values, dtype=float32)
        y_tensor = tensor(y.values, dtype=float32)
        dataset = AdultDataset(x_tensor, y_tensor, dec_to_onehot_mapping, one_hot_encoded=True)
        with open(pickle_path, "wb") as file:
            pickle.dump(dataset, file)
        print(f"Saved dataset to {pickle_path}")

    return dataset


def get_adult_dataloaders(dataset, train_fraction=0.3, test_fraction=0.3):
    """Sample disjoint train/test subsets of the dataset and wrap them in dataloaders."""
    dataset_size = len(dataset)
    train_size = int(train_fraction * dataset_size)
    test_size = int(test_fraction * dataset_size)

    # Randomly pick the pooled indices, then split them into train and test
    selected_index = np.random.choice(np.arange(dataset_size), train_size + test_size, replace=False)
    train_indices, test_indices = train_test_split(selected_index, test_size=test_size)

    train_subset = Subset(dataset, train_indices)
    test_subset = Subset(dataset, test_indices)

    train_loader = DataLoader(train_subset, batch_size=128, shuffle=True)
    test_loader = DataLoader(test_subset, batch_size=128, shuffle=False)

    return train_loader, test_loader, train_indices, test_indices
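To make dec_to_onehot concrete, a hypothetical mini-example (feature names are illustrative): for a table whose first two features stay continuous and whose third expands into three one-hot columns, preprocess_adult_dataset would build:

# Hypothetical mapping for 2 continuous features + 1 categorical feature
# with 3 categories, matching the structure built in preprocess_adult_dataset.
dec_to_onehot = {
    0: [0],        # "age" -> single scaled column 0
    1: [1],        # "hours-per-week" -> single scaled column 1
    2: [2, 3, 4],  # "workclass" -> one-hot columns 2..4
}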
111 changes: 111 additions & 0 deletions examples/gia/adult/utils/adult_model_preparation.py
@@ -0,0 +1,111 @@
import torch.nn as nn
from torch import cuda, device, no_grad, optim, sigmoid
from tqdm import tqdm


class AdultNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.init_params = {"input_size": input_size,
                            "hidden_size": hidden_size,
                            "num_classes": num_classes}
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out


def evaluate(model, loader, criterion, dev):
    model.eval()
    loss, acc = 0, 0
    with no_grad():
        for data, target in loader:
            data, target = data.to(dev), target.to(dev)
            target = target.float().unsqueeze(1)
            output = model(data)
            loss += criterion(output, target).item()
            pred = sigmoid(output) >= 0.5  # threshold the logit for a binary prediction
            acc += pred.eq(target.data.view_as(pred)).sum()
    loss /= len(loader)
    acc = float(acc) / len(loader.dataset)
    return loss, acc


def create_trained_model_and_metadata(model, train_loader, test_loader, epochs=10, metadata=None):

    device_name = device("cuda" if cuda.is_available() else "cpu")
    model.to(device_name)
    model.train()

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.8)
    train_losses, train_accuracies = [], []
    test_losses, test_accuracies = [], []

    with tqdm(total=epochs, leave=False) as pbar:
        for e in range(epochs):
            model.train()
            train_acc, train_loss = 0.0, 0.0

            for data, target in train_loader:
                target = target.float().unsqueeze(1)
                data, target = data.to(device_name, non_blocking=True), target.to(device_name, non_blocking=True)
                optimizer.zero_grad()
                output = model(data)

                loss = criterion(output, target)
                pred = sigmoid(output) >= 0.5
                train_acc += pred.eq(target).sum().item()

                loss.backward()
                optimizer.step()
                train_loss += loss.item()

            train_loss /= len(train_loader)
            train_acc /= len(train_loader.dataset)
            test_loss, test_acc = evaluate(model, test_loader, criterion, device_name)

            # Track the per-epoch history
            train_losses.append(train_loss)
            train_accuracies.append(train_acc)
            test_losses.append(test_loss)
            test_accuracies.append(test_acc)

            tqdm.write(f"Epoch {e+1}/{epochs} - Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")
            pbar.update(1)

    # Move the model back to the CPU
    model.to("cpu")

    # Create metadata and store it
    meta_data = {}
    meta_data["train_indices"] = train_loader.dataset.indices
    meta_data["test_indices"] = test_loader.dataset.indices
    meta_data["num_train"] = len(meta_data["train_indices"])

    # Write init params
    meta_data["init_params"] = dict(model.init_params)

    # Read out optimizer parameters
    meta_data["optimizer"] = {
        "name": optimizer.__class__.__name__.lower(),
        "lr": optimizer.param_groups[0].get("lr", 0),
        "weight_decay": optimizer.param_groups[0].get("weight_decay", 0),
        "momentum": optimizer.param_groups[0].get("momentum", 0),
        "dampening": optimizer.param_groups[0].get("dampening", 0),
        "nesterov": optimizer.param_groups[0].get("nesterov", False),
    }

    # Read out criterion parameters
    meta_data["loss"] = {"name": criterion.__class__.__name__.lower()}

    meta_data["batch_size"] = train_loader.batch_size
    meta_data["epochs"] = epochs
    meta_data["train_acc"] = train_acc
    meta_data["test_acc"] = test_acc
    meta_data["train_loss"] = train_loss
    meta_data["test_loss"] = test_loss
    meta_data["dataset"] = "adult"

    return model, meta_data
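audit.yaml above points LeakPro at a ./target folder, but this commit does not yet persist the trained model. One possible way to do so, given the model and meta_data returned by create_trained_model_and_metadata (file names here are assumptions, not a layout LeakPro mandates):

# Hypothetical persistence step; file names are illustrative.
import os
import pickle
from torch import save

os.makedirs("target", exist_ok=True)
save(model.state_dict(), "target/target_model.pkl")
with open("target/model_metadata.pkl", "wb") as f:
    pickle.dump(meta_data, f)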