aidotse · fazelehh · Jan 13, 2025 · Jan 21, 2025 · Jan 24, 2025 · Jan 30, 2025
diff --git a/.gitignore b/.gitignore
@@ -163,7 +163,6 @@ cython_debug/
 .DS_Store
 
 # Ignore data and model folders
-data
 data/cifar*
 data/CINIC*
 attack_objects/

diff --git a/examples/expm/.gitignore b/examples/expm/.gitignore
diff --git a/examples/expm/audit.yaml b/examples/expm/audit.yaml
@@ -0,0 +1,49 @@
+audit:  # Configurations for auditing
+  random_seed: 1234  # Integer specifying the random seed
+  attack_list:
+    rmia:
+      training_data_fraction: 0.5  # Fraction of the auxiliary dataset to use for this attack (in each shadow model training)
+      attack_data_fraction: 0.5 # Fraction of auxiliary dataset to sample from during attack
+      num_shadow_models: 8 # Number of shadow models to train
+      online: True # perform online or offline attack
+      temperature: 2
+      gamma: 1.0
+      offline_a: 0.33 # parameter from which we compute p(x) from p_OUT(x) such that p_IN(x) = a p_OUT(x) + b.
+      offline_b: 0.66
+    # qmia:
+    #   training_data_fraction: 1.0  # Fraction of the auxiliary dataset (data without train and test indices) to use for training the quantile regressor
+    #   epochs: 5  # Number of training epochs for quantile regression
+    # population:
+    #   attack_data_fraction: 1.0  # Fraction of the auxiliary dataset to use for this attack
+    lira:
+      training_data_fraction: 0.5  # Fraction of the auxiliary dataset to use for this attack (in each shadow model training)
+      num_shadow_models: 8 # Number of shadow models to train
+      online: True # perform online or offline attack
+      fixed_variance: True # Use a fixed variance for the whole audit
+      boosting: True
+    # loss_traj:
+    #   training_distill_data_fraction : 0.7 # Fraction of the auxiliary dataset to use for training the distillation models D_s = (1-D_KD)/2
+    #   number_of_traj: 10 # Number of epochs (number of points in the loss trajectory)
+    #   label_only: False # True or False
+    #   mia_classifier_epochs: 100
+
+  output_dir: "./examples/expm/leakpro_output"
+  attack_type: "mia" #mia, gia
+  dpsgd:
+    dpsgd_use: True
+    dpsgd_path: "./examples/expm/target_dpsgd/dpsgd_dic.pkl"
+
+
+
+target:
+  # Target model path
+  module_path: "examples/expm/utils/dpsgd_model.py" # either model_grud.py or model_LR.py for logistic regression
+  model_class: "GRUD_DPSGD" #  LR/GRUD
+  # Data paths
+  target_folder: "./examples/expm/target_dpsgd" # either target_GRUD or target_LR
+  data_path: "./examples/expm/data/mimic/dataset.pkl" #unflattened dataset for GRUD and flattened dataset for LR
+
+shadow_model:
+  model_class: dpsgd_model_handler # LR/GRUD
+
+distillation_model:
diff --git a/examples/expm/dpsgd_handler.py b/examples/expm/dpsgd_handler.py
@@ -0,0 +1,154 @@
+
+import os
+import pickle
+
+from opacus import PrivacyEngine
+from opacus.accountants.utils import get_noise_multiplier
+from sklearn.metrics import accuracy_score
+from torch import Tensor, cuda, device, nn, optim
+from torch.nn import BCEWithLogitsLoss
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from leakpro import AbstractInputHandler
+
+
+class MimicInputHandlerGRU(AbstractInputHandler):
+    """Class to handle the user input for the MIMICIII dataset and train models with DP-SGD."""
+
+    def __init__(self, configs: dict) -> None:
+        super().__init__(configs = configs)
+
+    def get_criterion(self) -> BCEWithLogitsLoss:
+        """Return the BCEWithLogitsLoss criterion used to train the model."""
+        return BCEWithLogitsLoss()
+
+    def get_optimizer(self, model: nn.Module) -> optim.Optimizer:
+        """Return an Adam optimizer over the parameters of the model."""
+        learning_rate = 0.01
+        return optim.Adam(model.parameters(), lr=learning_rate)
+
+    def convert_to_device(self, x: Tensor) -> Tensor:
+        """Move x to the GPU when CUDA is available, otherwise to the CPU."""
+        device_name = device("cuda" if cuda.is_available() else "cpu")
+        return x.to(device_name)
+
+    def to_numpy(self, tensor: Tensor):
+        """Return the tensor as a NumPy array, detached from the autograd graph."""
+        # .cpu() is a no-op for CPU tensors, so no is_cuda branch is needed.
+        return tensor.detach().cpu().numpy()
+
+    def _load_privacy_settings(self) -> dict:
+        """Load the pickled DP-SGD settings from the configured dpsgd_path."""
+        dpsgd_path = self.configs["audit"]["dpsgd"]["dpsgd_path"]
+        if not os.path.exists(dpsgd_path):
+            raise FileNotFoundError(f"File not found at: {dpsgd_path}")
+
+        # NOTE: pickle can execute arbitrary code on load; only point dpsgd_path
+        # at a file written by a trusted training run.
+        with open(dpsgd_path, "rb") as file:
+            privacy_engine_dict = pickle.load(file)
+        print("Pickle file loaded successfully!")
+        print("Data:", privacy_engine_dict)
+        return privacy_engine_dict
+
+    def _get_noise_multiplier(self, privacy_engine_dict: dict, sample_rate: float) -> float:
+        """Return the noise multiplier for the target epsilon stored in the settings."""
+        accountant_kwargs = {
+            "target_epsilon": privacy_engine_dict["target_epsilon"],
+            "target_delta": privacy_engine_dict["target_delta"],
+            "sample_rate": sample_rate,
+            "epochs": privacy_engine_dict["epochs"],
+            "epsilon_tolerance": privacy_engine_dict["epsilon_tolerance"],
+        }
+        try:
+            return get_noise_multiplier(
+                accountant="prv",
+                eps_error=privacy_engine_dict["eps_error"],
+                **accountant_kwargs,
+            )
+        except Exception:
+            # the prv accountant is not robust to large epsilon (even epsilon = 10)
+            # so we will use rdp when it fails, so the actual epsilon may be slightly off
+            # see https://github.com/pytorch/opacus/issues/604
+            # The fallback keeps the configured target epsilon instead of a hard-coded one.
+            return get_noise_multiplier(accountant="rdp", **accountant_kwargs)
+
+    def train(
+        self,
+        dataloader: DataLoader,
+        model: nn.Module = None,
+        criterion: nn.Module = None,
+        optimizer: optim.Optimizer = None,
+        epochs: int = None,
+    ) -> dict:
+        """Train the model with DP-SGD using the privacy settings stored on disk.
+
+        Args:
+            dataloader: Training batches of (x, labels).
+            model: Model to train; it is wrapped by Opacus before training.
+            criterion: Loss function; defaults to get_criterion() when None.
+            optimizer: Optimizer for the model parameters.
+            epochs: Number of training epochs.
+
+        Returns:
+            A dict with the trained "model" and the last epoch's "metrics"
+            ("accuracy" and "loss", both computed over all processed batches).
+
+        Raises:
+            FileNotFoundError: If the DP-SGD settings file does not exist.
+        """
+        print("Training shadow models with DP-SGD")
+        sample_rate = 1/len(dataloader)
+        privacy_engine_dict = self._load_privacy_settings()
+        noise_multiplier = self._get_noise_multiplier(privacy_engine_dict, sample_rate)
+
+        # make the model private
+        privacy_engine = PrivacyEngine(accountant = "prv")
+        model, optimizer, dataloader = privacy_engine.make_private(
+            module=model,
+            optimizer=optimizer,
+            data_loader=dataloader,
+            noise_multiplier=noise_multiplier,
+            max_grad_norm=privacy_engine_dict["max_grad_norm"],
+        )
+
+        device_name = device("cuda" if cuda.is_available() else "cpu")
+        model.to(device_name)
+
+        if criterion is None:
+            criterion = self.get_criterion()
+
+        # Defaults so the return value is well defined even when epochs == 0.
+        train_acc, train_loss = 0.0, 0.0
+        for _ in tqdm(range(epochs), desc="Training Progress"):
+            model.train()
+            epoch_loss, n_batches = 0.0, 0
+            epoch_preds, epoch_labels = [], []
+
+            for x, labels in tqdm(dataloader, desc="Training Batches"):
+                if x.numel() == 0:  # Skip empty batches
+                    continue
+
+                x = self.convert_to_device(x)
+                labels = self.convert_to_device(labels).float()
+
+                optimizer.zero_grad()
+                output = model(x).squeeze(dim=1)
+
+                loss = criterion(output, labels)
+                loss.backward()
+                optimizer.step()
+
+                epoch_loss += loss.item()
+                n_batches += 1
+                # A positive logit corresponds to a predicted probability above 0.5.
+                epoch_preds.extend((output > 0).float().cpu().tolist())
+                epoch_labels.extend(labels.cpu().tolist())
+
+            # Average over the batches actually processed, and score every sample
+            # seen this epoch rather than only the final batch.
+            train_loss = epoch_loss/max(n_batches, 1)
+            train_acc = accuracy_score(epoch_labels, epoch_preds) if epoch_labels else 0.0
+
+        return {"model": model, "metrics": {"accuracy": train_acc, "loss": train_loss}}
diff --git a/examples/expm/handler.py b/examples/expm/handler.py
diff --git a/examples/expm/main.ipynb b/examples/expm/main.ipynb
diff --git a/examples/expm/run_dpsgd_main.py b/examples/expm/run_dpsgd_main.py
@@ -0,0 +1,147 @@
+"""Train a DP-SGD target model on the MIMIC data and audit it with LeakPro."""
+import os
+
+import matplotlib.pyplot as plt
+from dpsgd_handler import MimicInputHandlerGRU
+from torch import zeros
+from utils.data_handler import get_mimic_dataloaders, get_mimic_dataset
+from utils.dpsgd_model import GRUD_DPSGD, dpsgd_gru_trained_model_and_metadata
+
+from leakpro import LeakPro
+from leakpro.reporting.report_handler import ReportHandler
+
+# Generate the dataset and dataloaders
+path = os.path.join(os.getcwd(), "examples/expm/data/mimic/")
+target_model_dir = "./examples/expm/target_dpsgd"
+epsilons = [.0001, .001, .01, .1, .5, 1, 2, 3.5, 5, 7, 10] # epsilons to run over (not used below; only target_epsilon is trained)
+delta = 1e-5
+target_epsilon = 3.5
+
+train_frac = 0.4
+valid_frac = 0.0
+test_frac = 0.0
+early_stop_frac = 0.4
+batch_size = 55
+use_LR = False # True if you want to use the LR model, False if you want to use the GRUD model
+
+dataset, train_indices, validation_indices, test_indices, early_stop_indices = get_mimic_dataset(
+    path,
+    train_frac,
+    valid_frac,
+    test_frac,
+    early_stop_frac,
+    use_LR,
+)
+
+train_loader, validation_loader, test_loader, early_stop_loader = get_mimic_dataloaders(
+    dataset,
+    train_indices,
+    validation_indices,
+    test_indices,
+    early_stop_indices,
+    batch_size,
+)
+
+sample_rate = 1/len(train_loader) # already incorporates batchsize
+
+# Privacy settings passed to dpsgd_gru_trained_model_and_metadata below.
+# NOTE(review): "epochs" here is 21 while num_epochs below is 50 -- confirm which one the accountant should use.
+noise_multiplier_dict = {
+    "target_epsilon": target_epsilon,
+    "target_delta": delta,
+    "sample_rate": sample_rate,
+    "epochs": 21,
+    "epsilon_tolerance": 0.01,
+    "accountant": "prv",
+    "eps_error": 0.01,
+    "max_grad_norm": 1,
+}
+
+# NOTE(review): "batch_size" here (59) differs from the dataloader batch_size (55) above -- confirm.
+optimized_hyperparams = {
+    "cell_size": 58,
+    "hidden_size": 78,
+    "learning_rate": 0.0004738759319792616,
+    "num_epochs": 50,
+    "patience_early_stopping": 20,
+    "patience_lr_scheduler": 5,
+    "batch_size": 59,
+    "seed": 4410,
+    "min_delta": 0.00001,
+    "epsilon": 3.5,
+    "max_grad_norm": 1,
+}
+n_features = int(dataset.x.shape[1]/3)
+X_mean = zeros(1, dataset.x.shape[2], n_features)
+
+model_params = {k: optimized_hyperparams[k] for k in ["cell_size", "hidden_size", "batch_size"]}
+
+# Add other required parameters to model_params
+model_params.update({
+    "input_size": n_features,
+    "X_mean": X_mean,
+    "output_last": False,
+    "bn_flag": False,
+    # "droupout": 0.33,
+})
+
+# Initialize the model with filtered parameters
+model = GRUD_DPSGD(**model_params)
+# Train the model
+results = dpsgd_gru_trained_model_and_metadata(
+    model,
+    train_loader,
+    early_stop_loader,
+    noise_multiplier_dict,
+    epochs=optimized_hyperparams["num_epochs"],
+    patience_early_stopping=optimized_hyperparams["patience_early_stopping"],
+    patience_lr=optimized_hyperparams["patience_lr_scheduler"],
+    min_delta=optimized_hyperparams["min_delta"],
+    learning_rate=optimized_hyperparams["learning_rate"],
+    target_model_dir=target_model_dir,
+)
+train_losses, test_losses, train_acc, test_acc, best_model, niter_per_epoch, privacy_engine = results
+
+# Convert losses to plain floats so matplotlib can plot them
+train_losses_cpu = [float(loss) for loss in train_losses]
+test_losses_cpu = [float(loss) for loss in test_losses]
+
+# Plot training and test accuracy
+plt.figure(figsize=(5, 4))
+
+plt.subplot(1, 2, 1)
+plt.plot(train_acc, label="Train Accuracy")
+plt.plot(test_acc, label="Test Accuracy")
+plt.xlabel("Epoch")
+plt.ylabel("Accuracy")
+plt.title("Accuracy over Epochs")
+plt.legend()
+
+# Plot training and test loss
+plt.subplot(1, 2, 2)
+plt.plot(train_losses_cpu, label="Train Loss")
+plt.plot(test_losses_cpu, label="Test Loss")
+plt.xlabel("Epoch")
+plt.ylabel("Loss")
+plt.title("Loss over Epochs")
+plt.legend()
+
+plt.tight_layout()
+# Save before show(): once the window is closed the figure may be gone, leaving an empty image.
+plt.savefig("psgd_gru.png")
+plt.show()
+
+# Read the config file
+config_path = "./examples/expm/audit.yaml"
+
+# Prepare leakpro object
+leakpro = LeakPro(MimicInputHandlerGRU, config_path)
+
+# Run the audit
+mia_results = leakpro.run_audit(return_results=True)
+
+report_handler = ReportHandler(report_dir="./examples/expm/leakpro_output/results")
+
+# Save MIA results using report handler
+for res in mia_results:
+    report_handler.save_results(attack_name=res.attack_name, result_data=res, config=res.configs)