Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dpsgd [work in progress] #209

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,6 @@ cython_debug/
.DS_Store

# Ignore data and model folders
data
data/cifar*
data/CINIC*
attack_objects/
Expand Down
Empty file added examples/expm/.gitignore
Empty file.
49 changes: 49 additions & 0 deletions examples/expm/audit.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
audit: # Configurations for auditing
random_seed: 1234 # Integer specifying the random seed
attack_list:
rmia:
training_data_fraction: 0.5 # Fraction of the auxiliary dataset to use for this attack (in each shadow model training)
attack_data_fraction: 0.5 # Fraction of auxiliary dataset to sample from during attack
num_shadow_models: 8 # Number of shadow models to train
online: True # perform online or offline attack
temperature: 2
gamma: 1.0
offline_a: 0.33 # parameter from which we compute p(x) from p_OUT(x) such that p_IN(x) = a p_OUT(x) + b.
offline_b: 0.66
# qmia:
# training_data_fraction: 1.0 # Fraction of the auxiliary dataset (data without train and test indices) to use for training the quantile regressor
# epochs: 5 # Number of training epochs for quantile regression
# population:
# attack_data_fraction: 1.0 # Fraction of the auxiliary dataset to use for this attack
lira:
training_data_fraction: 0.5 # Fraction of the auxiliary dataset to use for this attack (in each shadow model training)
num_shadow_models: 8 # Number of shadow models to train
online: True # perform online or offline attack
fixed_variance: True # Use a fixed variance for the whole audit
boosting: True
# loss_traj:
# training_distill_data_fraction : 0.7 # Fraction of the auxiliary dataset to use for training the distillation models D_s = (1-D_KD)/2
# number_of_traj: 10 # Number of epochs (number of points in the loss trajectory)
# label_only: False # True or False
# mia_classifier_epochs: 100

output_dir: "./examples/expm/leakpro_output"
attack_type: "mia" #mia, gia
dpsgd:
dpsgd_use: True
dpsgd_path: "./examples/expm/target_dpsgd/dpsgd_dic.pkl"



target:
# Target model path
module_path: "examples/expm/utils/dpsgd_model.py" # either model_grud.py or model_LR.py for logistic regression
model_class: "GRUD_DPSGD" # LR/GRUD
# Data paths
target_folder: "./examples/expm/target_dpsgd" # either target_GRUD or target_LR
data_path: "./examples/expm/data/mimic/dataset.pkl" #unflattened dataset for GRUD and flattened dataset for LR

shadow_model:
model_class: dpsgd_model_handler # LR/GRUD

distillation_model:
125 changes: 125 additions & 0 deletions examples/expm/dpsgd_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@

import os
import pickle

from opacus import PrivacyEngine
from opacus.accountants.utils import get_noise_multiplier
from sklearn.metrics import accuracy_score
from torch import cuda, device, nn, optim
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader
from tqdm import tqdm

from leakpro import AbstractInputHandler


class MimicInputHandlerGRU(AbstractInputHandler):
    """Input handler for the MIMIC-III dataset that trains models with DP-SGD."""

    def __init__(self, configs: dict) -> None:
        """Forward the audit configuration to the base handler."""
        super().__init__(configs=configs)

    def get_criterion(self) -> BCEWithLogitsLoss:
        """Return the binary cross-entropy-with-logits loss used for training."""
        return BCEWithLogitsLoss()

    def get_optimizer(self, model: nn.Module) -> optim.Optimizer:
        """Return an Adam optimizer over the model parameters (lr=0.01)."""
        learning_rate = 0.01
        return optim.Adam(model.parameters(), lr=learning_rate)

    def convert_to_device(self, x):
        """Move ``x`` to CUDA when it is available, otherwise to the CPU."""
        device_name = device("cuda" if cuda.is_available() else "cpu")
        return x.to(device_name)

    def to_numpy(self, tensor):
        """Return ``tensor`` as a NumPy array, detached and copied to the CPU."""
        # ``.cpu()`` is a no-op for tensors already on the CPU, so a single
        # expression covers both the CUDA and the CPU case.
        return tensor.detach().cpu().numpy()

    def train(
        self,
        dataloader: DataLoader,
        model: nn.Module = None,
        criterion: nn.Module = None,
        optimizer: optim.Optimizer = None,
        epochs: int = None,
    ) -> dict:
        """Train ``model`` with DP-SGD (Opacus) and return it with its metrics.

        The DP-SGD settings are read from the pickle file referenced by
        ``configs["audit"]["dpsgd"]["dpsgd_path"]``. The keys read here are
        ``target_epsilon``, ``target_delta``, ``epochs``,
        ``epsilon_tolerance``, ``eps_error`` and ``max_grad_norm``.

        Args:
            dataloader: Training batches yielding ``(x, labels)`` pairs.
            model: Model to train; it is wrapped by the Opacus privacy engine.
            criterion: Ignored; the loss from ``get_criterion`` is always used.
            optimizer: Optimizer to be wrapped by the privacy engine.
            epochs: Number of training epochs. When ``None``, the ``epochs``
                value stored in the pickle file is used, since that is the
                value the noise multiplier is calibrated for.

        Returns:
            ``{"model": model, "metrics": {"accuracy": ..., "loss": ...}}``
            where the metrics describe the last training epoch.

        Raises:
            FileNotFoundError: If the DP-SGD pickle file does not exist.

        """
        print("Training shadow models with DP-SGD")
        dpsgd_path = self.configs["audit"]["dpsgd"]["dpsgd_path"]

        sample_rate = 1 / len(dataloader)

        if not os.path.exists(dpsgd_path):
            raise FileNotFoundError(f"File not found at: {dpsgd_path}")
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # ``dpsgd_path`` at files produced by a trusted training run.
        with open(dpsgd_path, "rb") as file:
            privacy_engine_dict = pickle.load(file)
        print("Pickle file loaded successfully!")
        print("Data:", privacy_engine_dict)

        if epochs is None:
            epochs = privacy_engine_dict["epochs"]

        try:
            noise_multiplier = get_noise_multiplier(
                target_epsilon=privacy_engine_dict["target_epsilon"],
                target_delta=privacy_engine_dict["target_delta"],
                sample_rate=sample_rate,
                epochs=privacy_engine_dict["epochs"],
                epsilon_tolerance=privacy_engine_dict["epsilon_tolerance"],
                accountant="prv",
                eps_error=privacy_engine_dict["eps_error"],
            )
        except Exception:
            # the prv accountant is not robust to large epsilon (even epsilon = 10)
            # so we will use rdp when it fails, so the actual epsilon may be slightly off
            # see https://github.com/pytorch/opacus/issues/604
            # The fallback keeps the configured target epsilon so the noise
            # level still matches the requested privacy budget.
            noise_multiplier = get_noise_multiplier(
                target_epsilon=privacy_engine_dict["target_epsilon"],
                target_delta=privacy_engine_dict["target_delta"],
                sample_rate=sample_rate,
                epochs=privacy_engine_dict["epochs"],
                epsilon_tolerance=privacy_engine_dict["epsilon_tolerance"],
                accountant="rdp",
            )

        # make the model private
        privacy_engine = PrivacyEngine(accountant="prv")
        model, optimizer, dataloader = privacy_engine.make_private(
            module=model,
            optimizer=optimizer,
            data_loader=dataloader,
            noise_multiplier=noise_multiplier,
            max_grad_norm=privacy_engine_dict["max_grad_norm"],
        )

        device_name = device("cuda" if cuda.is_available() else "cpu")
        model.to(device_name)
        model.train()

        # NOTE(review): the ``criterion`` argument is deliberately left unused,
        # as in the original implementation; confirm whether callers expect it
        # to be honoured.
        criterion = self.get_criterion()

        # Defined up front so the return statement is valid even for 0 epochs.
        train_acc, train_loss = 0.0, 0.0

        for _ in tqdm(range(epochs), desc="Training Progress"):
            model.train()
            train_loss = 0.0
            epoch_predictions, epoch_labels = [], []

            for x, labels in tqdm(dataloader, desc="Training Batches"):
                if x.numel() == 0:  # Skip empty batches
                    continue

                x = self.convert_to_device(x)
                labels = self.convert_to_device(labels).float()

                optimizer.zero_grad()
                output = model(x).squeeze(dim=1)

                loss = criterion(output, labels)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                # A logit above 0 corresponds to a predicted positive class.
                epoch_predictions.extend((output > 0).float().cpu().tolist())
                epoch_labels.extend(labels.cpu().tolist())

            train_loss = train_loss / len(dataloader)
            # Accuracy over every sample seen this epoch, not just the last batch.
            if epoch_labels:
                train_acc = accuracy_score(epoch_labels, epoch_predictions)
            else:
                train_acc = 0.0

        return {"model": model, "metrics": {"accuracy": train_acc, "loss": train_loss}}
Empty file added examples/expm/handler.py
Empty file.
Empty file added examples/expm/main.ipynb
Empty file.
153 changes: 153 additions & 0 deletions examples/expm/run_dpsgd_main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import os

from torch import zeros
from utils.data_handler import get_mimic_dataloaders, get_mimic_dataset
from utils.dpsgd_model import *

# Import and initialize ReportHandler
from leakpro.reporting.report_handler import ReportHandler

# ---------------------------------------------------------------------------
# Experiment configuration, dataset split and DP-SGD settings
# ---------------------------------------------------------------------------
path = os.path.join(os.getcwd(), "examples/expm/data/mimic/")
target_model_dir = "./examples/expm/target_dpsgd"

# Privacy budget settings.
epsilons = [.0001, .001, .01, .1, .5, 1, 2, 3.5, 5, 7, 10]  # epsilons to run over
delta = 1e-5
target_epsilon = 3.5

# Fractions of the dataset assigned to each split.
train_frac, valid_frac, test_frac, early_stop_frac = 0.4, 0.0, 0.0, 0.4
batch_size = 55
# True if you want to use the LR model, False if you want to use the GRUD model
use_LR = False

(
    dataset,
    train_indices,
    validation_indices,
    test_indices,
    early_stop_indices,
) = get_mimic_dataset(path, train_frac, valid_frac, test_frac, early_stop_frac, use_LR)

(
    train_loader,
    validation_loader,
    test_loader,
    early_stop_loader,
) = get_mimic_dataloaders(
    dataset,
    train_indices,
    validation_indices,
    test_indices,
    early_stop_indices,
    batch_size,
)

# One batch out of len(train_loader) batches, so the batch size is already
# accounted for.
sample_rate = 1 / len(train_loader)

# Settings handed to the DP-SGD training routine.
noise_multiplier_dict = dict(
    target_epsilon=target_epsilon,
    target_delta=delta,
    sample_rate=sample_rate,
    epochs=21,
    epsilon_tolerance=0.01,
    accountant="prv",
    eps_error=0.01,
    max_grad_norm=1,
)





# Hyperparameters used for the target GRU-D model.
optimized_hyperparams = {
    "cell_size": 58,
    "hidden_size": 78,
    "learning_rate": 0.0004738759319792616,
    "num_epochs": 50,
    "patience_early_stopping": 20,
    "patience_lr_scheduler": 5,
    "batch_size": 59,
    "seed": 4410,
    "min_delta": 0.00001,
    "epsilon": 3.5,
    "max_grad_norm": 1,
}

# The second axis of dataset.x is divided by three to get the feature count.
n_features = int(dataset.x.shape[1] / 3)
X_mean = zeros(1, dataset.x.shape[2], n_features)

# Constructor arguments for the model: the tuned sizes plus fixed settings.
model_params = {
    "cell_size": optimized_hyperparams["cell_size"],
    "hidden_size": optimized_hyperparams["hidden_size"],
    "batch_size": optimized_hyperparams["batch_size"],
    "input_size": n_features,
    "X_mean": X_mean,
    "output_last": False,
    "bn_flag": False,
}

# Build the model and train it with DP-SGD.
model = GRUD_DPSGD(**model_params)
results = dpsgd_gru_trained_model_and_metadata(
    model,
    train_loader,
    early_stop_loader,
    noise_multiplier_dict,
    epochs=optimized_hyperparams["num_epochs"],
    patience_early_stopping=optimized_hyperparams["patience_early_stopping"],
    patience_lr=optimized_hyperparams["patience_lr_scheduler"],
    min_delta=optimized_hyperparams["min_delta"],
    learning_rate=optimized_hyperparams["learning_rate"],
    target_model_dir=target_model_dir,
)
(
    train_losses,
    test_losses,
    train_acc,
    test_acc,
    best_model,
    niter_per_epoch,
    privacy_engine,
) = results


import matplotlib.pyplot as plt

# Convert losses to plain Python floats so that they can be plotted even when
# the training routine returns tensors or NumPy scalars.
train_losses_cpu = [float(loss) for loss in train_losses]
test_losses_cpu = [float(loss) for loss in test_losses]

# Plot training and test accuracy
plt.figure(figsize=(5, 4))

plt.subplot(1, 2, 1)
plt.plot(train_acc, label="Train Accuracy")
plt.plot(test_acc, label="Test Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Accuracy over Epochs")
plt.legend()

# Plot training and test loss (using the float-converted values)
plt.subplot(1, 2, 2)
plt.plot(train_losses_cpu, label="Train Loss")
plt.plot(test_losses_cpu, label="Test Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Loss over Epochs")
plt.legend()

plt.tight_layout()
# Save before showing: once show() returns, the figure may already have been
# closed, in which case a later savefig() would write an empty image.
plt.savefig("psgd_gru.png")
plt.show()



from dpsgd_handler import MimicInputHandlerGRU

from leakpro import LeakPro

# Audit configuration file consumed by LeakPro.
config_path = "./examples/expm/audit.yaml"

# Build the LeakPro object from the handler class and the config, then run
# the audit and keep the results in memory.
leakpro = LeakPro(MimicInputHandlerGRU, config_path)
mia_results = leakpro.run_audit(return_results=True)

# Report handler writing into the example's output directory.
report_handler = ReportHandler(report_dir="./examples/expm/leakpro_output/results")

# Save the MIA results, one entry per attack.
for attack_result in mia_results:
    report_handler.save_results(
        attack_name=attack_result.attack_name,
        result_data=attack_result,
        config=attack_result.configs,
    )

Loading