Commit: first step towards tabular fl attack
Showing 10 changed files with 468 additions and 34 deletions.
New file: adultGIAHandler.py (+48 lines)

```python
from torch.nn import Module, BCEWithLogitsLoss
from torch import cuda, device, optim
from torch.utils.data import DataLoader

from leakpro.input_handler.abstract_gia_input_handler import AbstractGIAInputHandler
from leakpro.fl_utils.gia_optimizers import MetaOptimizer
from leakpro.fl_utils.gia_module_to_functional import MetaModule


class adultGiaHandler(AbstractGIAInputHandler):

    def __init__(self, configs: dict, dataloader: DataLoader) -> None:
        super().__init__(configs)
        self.dataloader = dataloader

    def get_client_dataloader(self) -> DataLoader:
        return self.dataloader

    def get_criterion(self) -> Module:
        """Return the BCEWithLogitsLoss criterion for binary classification."""
        return BCEWithLogitsLoss()

    def get_optimizer(self, model: Module) -> optim.Optimizer:
        """Return the SGD optimizer for the model."""
        learning_rate = 0.1
        momentum = 0.8
        return optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

    @staticmethod  # the signature takes no self in the diff, so it is marked static here
    def train(
        dataloader: DataLoader,
        model: Module = None,
        criterion: Module = None,
        optimizer: MetaOptimizer = None,
        epochs: int = None,
    ) -> MetaModule:
        """Model training procedure: simulate the client update on a functional (meta) model."""
        dev = device("cuda" if cuda.is_available() else "cpu")
        model.to(dev)
        patched_model = MetaModule(model)

        for _ in range(epochs):
            for data, target in dataloader:
                target = target.float().unsqueeze(1)
                data, target = data.to(dev, non_blocking=True), target.to(dev, non_blocking=True)
                output = patched_model(data, patched_model.parameters)
                loss = criterion(output, target)
                # MetaOptimizer.step returns the updated parameters instead of mutating in place
                patched_model.parameters = optimizer.step(loss, patched_model.parameters)
        return patched_model
```
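The `train` method relies on a functional update pattern: instead of `loss.backward()` followed by an in-place `optimizer.step()`, the `MetaOptimizer` maps `(loss, parameters)` to fresh parameters, which keeps the whole client update differentiable for the inversion attack. A minimal sketch of that pattern with plain `torch.autograd` (illustrative only; `sgd_functional_step` is a stand-in, not LeakPro's `MetaOptimizer`):

```python
import torch

def sgd_functional_step(loss, params, lr=0.1):
    """Return updated parameters without mutating the originals.

    create_graph=True keeps the update differentiable, which is what
    lets a gradient-inversion attack backpropagate through training.
    """
    grads = torch.autograd.grad(loss, params, create_graph=True)
    return [p - lr * g for p, g in zip(params, grads)]

# Toy usage: one functional step on a linear model y = x @ w
w = torch.randn(3, 1, requires_grad=True)
x, y = torch.randn(4, 3), torch.randn(4, 1)
loss = ((x @ w - y) ** 2).mean()
(w_new,) = sgd_functional_step(loss, [w])
print(w_new.requires_grad)  # True: the new weights stay in the autograd graph
```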
New file: audit.yaml (+18 lines)

```yaml
audit:  # Configurations for auditing
  random_seed: 1234  # Integer specifying the random seed
  attack_list:
    tableak:
      pool_size: 10

  output_dir: "./leakpro_output"
  attack_type: "gia"  # mia, gia
  modality: "tabular"

target:
  # Target model path
  module_path: "utils/adult_model_preparation.py"
  model_class: "AdultNet"
  # Data paths
  target_folder: "./target"
  data_path: "./data/adult_data.pkl"
```
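A quick way to sanity-check the config is to load it directly; a sketch using PyYAML (whether LeakPro parses the file the same way internally is not shown in this commit):

```python
import yaml

with open("audit.yaml") as f:
    config = yaml.safe_load(f)

assert config["audit"]["attack_type"] == "gia"
assert "tableak" in config["audit"]["attack_list"]
print(config["target"]["model_class"])  # AdultNet
```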
New file: example driver script (+36 lines; filename not shown in this capture)

```python
"""Script demonstrates how to perform a gradient inversion attack on Adult."""

import numpy as np
from torch.utils.data import DataLoader, Subset

from adultGIAHandler import adultGiaHandler
from leakpro import LeakPro
from utils.adult_data_preparation import download_adult_dataset, get_adult_dataloaders, preprocess_adult_dataset
from utils.adult_model_preparation import AdultNet, create_trained_model_and_metadata


def train_global_model(dataset):
    n_features = dataset.x.shape[1]
    n_classes = 1
    train_loader, test_loader, train_indices, _ = get_adult_dataloaders(dataset, train_fraction=0.3, test_fraction=0.3)

    global_model = AdultNet(input_size=n_features, hidden_size=64, num_classes=n_classes)
    global_model, meta_data = create_trained_model_and_metadata(global_model, train_loader, test_loader, epochs=10)
    return global_model, meta_data


if __name__ == "__main__":
    path = "./data"
    download_adult_dataset(path)
    dataset = preprocess_adult_dataset(path)
    global_model, meta_data = train_global_model(dataset)

    # Build the client's local dataloader from a small random subset
    n_client_data = 8
    client_data_indices = np.random.choice(len(dataset), n_client_data, replace=False)
    client_subset = Subset(dataset, client_data_indices)
    client_loader = DataLoader(client_subset, batch_size=8, shuffle=True)

    # Set up LeakPro
    config_path = "audit.yaml"

    # Prepare the LeakPro object
    leakpro = LeakPro(adultGiaHandler, config_path)
```
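For intuition, a gradient inversion attack reconstructs the client's data by optimizing dummy inputs until their gradients match the gradients the client reported to the server. A minimal self-contained sketch of that loop (illustrative only; the `tableak` attack configured above is considerably more involved and exploits the tabular structure):

```python
import torch
from torch import nn

model = nn.Linear(10, 1)                      # stand-in for the shared FL model
criterion = nn.BCEWithLogitsLoss()

# "Observed" client gradients from one true sample (label assumed known)
x_true, y_true = torch.randn(1, 10), torch.ones(1, 1)
true_grads = torch.autograd.grad(criterion(model(x_true), y_true), model.parameters())

# Optimize a dummy sample so its gradients match the observed ones
x_dummy = torch.randn(1, 10, requires_grad=True)
opt = torch.optim.Adam([x_dummy], lr=0.1)
for _ in range(200):
    opt.zero_grad()
    dummy_grads = torch.autograd.grad(criterion(model(x_dummy), y_true),
                                      model.parameters(), create_graph=True)
    loss = sum(((dg - tg) ** 2).sum() for dg, tg in zip(dummy_grads, true_grads))
    loss.backward()
    opt.step()

print((x_dummy - x_true).norm())  # should shrink toward 0 as the dummy converges
```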
New file: utils/adult_data_preparation.py (+137 lines)

```python
import os
import pickle
import urllib.request

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from torch import float32, tensor
from torch.utils.data import DataLoader, Dataset, Subset


class AdultDataset(Dataset):
    def __init__(self, x: tensor, y: tensor, dec_to_onehot: dict, one_hot_encoded: bool = True):
        self.x = x
        self.y = y

        # Dictionary mapping indices in the categorical representation to indices in the
        # one-hot encoded representation. For example: if cols 1, 2 are continuous and
        # col 3 is categorical with 3 categories, the mapping is {1: [1], 2: [2], 3: [3, 4, 5]}.
        self.dec_to_onehot = dec_to_onehot
        self.one_hot_encoded = one_hot_encoded

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

    def subset(self, indices):
        return AdultDataset(self.x[indices], self.y[indices], self.dec_to_onehot, self.one_hot_encoded)


def download_adult_dataset(data_dir):
    """Download the Adult dataset if it is not already present."""
    # URLs for the dataset
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/"
    data_file = os.path.join(data_dir, "adult.data")
    test_file = os.path.join(data_dir, "adult.test")

    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
        print("Created directory:", data_dir)
    else:
        print("Directory already exists:", data_dir)

    # Download the dataset if not present
    if not os.path.exists(data_file):
        print("Downloading adult.data...")
        urllib.request.urlretrieve(base_url + "adult.data", data_file)

    if not os.path.exists(test_file):
        print("Downloading adult.test...")
        urllib.request.urlretrieve(base_url + "adult.test", test_file)


def preprocess_adult_dataset(path):
    """Load the preprocessed dataset if cached; otherwise preprocess and store it."""
    if os.path.exists(os.path.join(path, "adult_data.pkl")):
        with open(os.path.join(path, "adult_data.pkl"), "rb") as f:
            dataset = joblib.load(f)
    else:
        column_names = [
            "age", "workclass", "fnlwgt", "education", "education-num",
            "marital-status", "occupation", "relationship", "race", "sex",
            "capital-gain", "capital-loss", "hours-per-week", "native-country", "income",
        ]

        # Load and clean data
        df_train = pd.read_csv(os.path.join(path, "adult.data"), names=column_names)
        df_test = pd.read_csv(os.path.join(path, "adult.test"), names=column_names, header=0)
        df_test["income"] = df_test["income"].str.replace(".", "", regex=False)

        df_concatenated = pd.concat([df_train, df_test], axis=0)
        df_clean = df_concatenated.replace(" ?", np.nan).dropna()

        # Split features and labels
        x, y = df_clean.iloc[:, :-1], df_clean.iloc[:, -1]

        # Categorical and numerical columns
        categorical_features = [col for col in x.columns if x[col].dtype == "object"]
        numerical_features = [col for col in x.columns if x[col].dtype in ["int64", "float64"]]

        # Scale numerical features
        scaler = StandardScaler()
        x_numerical = pd.DataFrame(scaler.fit_transform(x[numerical_features]), columns=numerical_features, index=x.index)

        # One-hot encode the categorical features
        one_hot_encoder = OneHotEncoder(sparse_output=False)
        x_categorical_one_hot = one_hot_encoder.fit_transform(x[categorical_features])
        one_hot_feature_names = one_hot_encoder.get_feature_names_out(categorical_features)
        x_categorical_one_hot_df = pd.DataFrame(x_categorical_one_hot, columns=one_hot_feature_names, index=x.index)

        # Concatenate the numerical and one-hot encoded categorical features
        x_final = pd.concat([x_numerical, x_categorical_one_hot_df], axis=1)

        # Label encode the target variable
        y = pd.Series(LabelEncoder().fit_transform(y))

        # Add numerical features to the mapping dictionary
        dec_to_onehot_mapping = {}
        for i, feature in enumerate(numerical_features):
            dec_to_onehot_mapping[i] = [x_final.columns.get_loc(feature)]  # map to column index

        # Add one-hot encoded features to the mapping dictionary
        for i, categorical_feature in enumerate(categorical_features):
            j = i + len(numerical_features)
            one_hot_columns = [col for col in one_hot_feature_names if col.startswith(categorical_feature)]
            dec_to_onehot_mapping[j] = [x_final.columns.get_loc(col) for col in one_hot_columns]

        # Create the tensor dataset and store it
        x_tensor = tensor(x_final.values, dtype=float32)
        y_tensor = tensor(y.values, dtype=float32)
        dataset = AdultDataset(x_tensor, y_tensor, dec_to_onehot_mapping, one_hot_encoded=True)
        with open(f"{path}/adult_data.pkl", "wb") as file:
            pickle.dump(dataset, file)
        print(f"Saved data to {path}/adult_data.pkl")

    return dataset


def get_adult_dataloaders(dataset, train_fraction=0.3, test_fraction=0.3):
    """Split the dataset into train/test subsets and wrap them in dataloaders."""
    dataset_size = len(dataset)
    train_size = int(train_fraction * dataset_size)
    test_size = int(test_fraction * dataset_size)

    # Use sklearn's train_test_split to split into train and test indices
    selected_index = np.random.choice(np.arange(dataset_size), train_size + test_size, replace=False)
    train_indices, test_indices = train_test_split(selected_index, test_size=test_size)

    train_subset = Subset(dataset, train_indices)
    test_subset = Subset(dataset, test_indices)

    train_loader = DataLoader(train_subset, batch_size=128, shuffle=True)
    test_loader = DataLoader(test_subset, batch_size=128, shuffle=False)

    return train_loader, test_loader, train_indices, test_indices
```
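The `dec_to_onehot` mapping is what lets a tabular attack move between the one-hot representation the model sees and the original categorical columns. A small sketch of decoding one encoded row back to per-feature values with that mapping (hypothetical usage, assuming the layout described in the class comment above):

```python
import torch

def decode_row(row: torch.Tensor, dec_to_onehot: dict) -> list:
    """Map a one-hot encoded feature row back to per-feature values.

    Continuous features map to a single column (keep the value);
    categorical features map to a block of columns (take the argmax).
    """
    decoded = []
    for feature_idx in sorted(dec_to_onehot):
        cols = dec_to_onehot[feature_idx]
        if len(cols) == 1:                      # continuous feature
            decoded.append(row[cols[0]].item())
        else:                                   # categorical: index of the active one-hot column
            decoded.append(int(row[cols].argmax().item()))
    return decoded

# Usage: decoded = decode_row(dataset.x[0], dataset.dec_to_onehot)
```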
New file: utils/adult_model_preparation.py (+111 lines)

```python
import torch.nn as nn
from torch import cuda, device, no_grad, optim, sigmoid
from tqdm import tqdm


class AdultNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.init_params = {"input_size": input_size,
                            "hidden_size": hidden_size,
                            "num_classes": num_classes}
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out


def evaluate(model, loader, criterion, device):
    model.eval()
    loss, acc = 0, 0
    with no_grad():
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            target = target.float().unsqueeze(1)
            output = model(data)
            loss += criterion(output, target).item()
            pred = sigmoid(output) >= 0.5
            acc += pred.eq(target.data.view_as(pred)).sum()
    loss /= len(loader)
    acc = float(acc) / len(loader.dataset)
    return loss, acc


def create_trained_model_and_metadata(model, train_loader, test_loader, epochs=10, metadata=None):
    device_name = device("cuda" if cuda.is_available() else "cpu")
    model.to(device_name)
    model.train()

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.8)

    with tqdm(total=epochs, leave=False) as pbar:
        for e in range(epochs):
            model.train()
            train_acc, train_loss = 0.0, 0.0

            for data, target in train_loader:
                target = target.float().unsqueeze(1)
                data, target = data.to(device_name, non_blocking=True), target.to(device_name, non_blocking=True)
                optimizer.zero_grad()
                output = model(data)

                loss = criterion(output, target)
                pred = sigmoid(output) >= 0.5
                train_acc += pred.eq(target).sum().item()

                loss.backward()
                optimizer.step()
                train_loss += loss.item()

            train_loss /= len(train_loader)
            train_acc /= len(train_loader.dataset)
            test_loss, test_acc = evaluate(model, test_loader, criterion, device_name)

            tqdm.write(f"Epoch {e+1}/{epochs} - Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")
            pbar.update(1)

    # Move the model back to the CPU
    model.to("cpu")

    # Create metadata and store it
    meta_data = {}
    meta_data["train_indices"] = train_loader.dataset.indices
    meta_data["test_indices"] = test_loader.dataset.indices
    meta_data["num_train"] = len(meta_data["train_indices"])

    # Write init params
    meta_data["init_params"] = {}
    for key, value in model.init_params.items():
        meta_data["init_params"][key] = value

    # Read out optimizer parameters
    meta_data["optimizer"] = {}
    meta_data["optimizer"]["name"] = optimizer.__class__.__name__.lower()
    meta_data["optimizer"]["lr"] = optimizer.param_groups[0].get("lr", 0)
    meta_data["optimizer"]["weight_decay"] = optimizer.param_groups[0].get("weight_decay", 0)
    meta_data["optimizer"]["momentum"] = optimizer.param_groups[0].get("momentum", 0)
    meta_data["optimizer"]["dampening"] = optimizer.param_groups[0].get("dampening", 0)
    meta_data["optimizer"]["nesterov"] = optimizer.param_groups[0].get("nesterov", False)

    # Read out criterion parameters
    meta_data["loss"] = {}
    meta_data["loss"]["name"] = criterion.__class__.__name__.lower()

    meta_data["batch_size"] = train_loader.batch_size
    meta_data["epochs"] = epochs
    meta_data["train_acc"] = train_acc
    meta_data["test_acc"] = test_acc
    meta_data["train_loss"] = train_loss
    meta_data["test_loss"] = test_loss
    meta_data["dataset"] = "adult"

    return model, meta_data
```
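The recorded `init_params` make the metadata self-describing: an identical `AdultNet` can be rebuilt without hard-coding layer sizes. A small usage sketch (assumes `dataset`, `train_loader`, and `test_loader` from the preparation utilities above):

```python
# Train once, then rebuild an identical architecture purely from the metadata
model = AdultNet(input_size=dataset.x.shape[1], hidden_size=64, num_classes=1)
model, meta_data = create_trained_model_and_metadata(model, train_loader, test_loader, epochs=10)

clone = AdultNet(**meta_data["init_params"])
clone.load_state_dict(model.state_dict())  # same shapes, same weights
```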