From e84a536b1a967d4b1162e0ec62c21ad388193b8e Mon Sep 17 00:00:00 2001 From: fazelehh Date: Mon, 13 Jan 2025 19:26:25 +0000 Subject: [PATCH 1/5] fixing data path --- .gitignore | 1 - examples/expm/.gitignore | 0 examples/expm/data/.gitkeep | 0 examples/expm/handler.py | 0 examples/expm/main.ipynb | 0 examples/expm/utils/data_handler.py | 350 ++++++++++++++++++++++++++++ 6 files changed, 350 insertions(+), 1 deletion(-) create mode 100644 examples/expm/.gitignore create mode 100644 examples/expm/data/.gitkeep create mode 100644 examples/expm/handler.py create mode 100644 examples/expm/main.ipynb create mode 100644 examples/expm/utils/data_handler.py diff --git a/.gitignore b/.gitignore index cf704f92..a3dc9187 100755 --- a/.gitignore +++ b/.gitignore @@ -163,7 +163,6 @@ cython_debug/ .DS_Store # Ignore data and model folders -data data/cifar* data/CINIC* attack_objects/ diff --git a/examples/expm/.gitignore b/examples/expm/.gitignore new file mode 100644 index 00000000..e69de29b diff --git a/examples/expm/data/.gitkeep b/examples/expm/data/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/examples/expm/handler.py b/examples/expm/handler.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/expm/main.ipynb b/examples/expm/main.ipynb new file mode 100644 index 00000000..e69de29b diff --git a/examples/expm/utils/data_handler.py b/examples/expm/utils/data_handler.py new file mode 100644 index 00000000..b59339f5 --- /dev/null +++ b/examples/expm/utils/data_handler.py @@ -0,0 +1,350 @@ +""" +This file is inspired by https://github.com/MLforHealth/MIMIC_Extract +MIT License +Copyright (c) 2019 MIT Laboratory for Computational Physiology +""" +#TODO: Do I need to include the license for this file.? +import os +import pickle + +import numpy as np +import pandas as pd +from sklearn.preprocessing import StandardScaler +from torch import Tensor, from_numpy +from torch.utils.data import DataLoader, Dataset, Subset +from utils.gru_model_handler import to_3D_tensor +from tqdm import tqdm + + +class MimicDataset(Dataset): + def __init__(self, x, y): + # Check if x is already a tensor + if not isinstance(x, Tensor): + self.x = from_numpy(x).float() # Convert features to torch tensors if needed + else: + self.x = x.float() # Ensure it is of type float + + # Check if y is already a tensor + if not isinstance(y, Tensor): + self.y = from_numpy(y).float() # Convert labels to torch tensors if needed + else: + self.y = y.float() # Ensure it is of type float + + def __len__(self): + return len(self.y) + + def __getitem__(self, idx): + return self.x[idx], self.y[idx].squeeze(0) + + def subset(self, indices): + return MimicDataset(self.x[indices], self.y[indices]) + + +def get_mimic_dataset(path, + train_frac, + validation_frac, + test_frac, + early_stop_frac, + use_LR = True): + """Get the dataset, download it if necessary, and store it.""" + + # Assert that the sum of all fractions is between 0 and 1 + total_frac = train_frac + validation_frac + test_frac + early_stop_frac + assert 0 < total_frac <= 1, "The sum of dataset fractions must be between 0 and 1." 
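+    # Note: these fractions are mapped to contiguous, disjoint index ranges by
+    # data_indices() further down in this file (train, then validation, then test,
+    # then early-stop). As an illustrative sketch with made-up sizes: for
+    # N = 10_000 samples and train_frac = 0.4, validation_frac = 0.0,
+    # test_frac = 0.0, early_stop_frac = 0.4 (the values used by the run scripts),
+    # the returned splits would be
+    #     train_indices      -> range(0, 4000)
+    #     early_stop_indices -> range(4000, 8000)
+    # with validation/test empty and the remaining 20% of samples unused.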
+ + + dataset_path = os.path.join(path, "dataset.pkl") + indices_path = os.path.join(path, "indices.pkl") + + if os.path.exists(dataset_path) and os.path.exists(indices_path): + print("Loading dataset...") + with open(dataset_path, "rb") as f: + dataset = pickle.load(f) # Load the dataset + with open(indices_path, "rb") as f: + indices_dict = pickle.load(f) # Load the dictionary containing indices + train_indices = indices_dict["train_indices"] # Get the actual train indices + validation_indices = indices_dict["validation_indices"] # Get the actual validation indices + test_indices = indices_dict["test_indices"] # Get the actual test indices + early_stop_indices = indices_dict["early_stop_indices"] # Get the actual early stop indices + print(f"Loaded dataset from {dataset_path}") + return dataset, train_indices, validation_indices ,test_indices, early_stop_indices + + data_file_path = os.path.join(path, "all_hourly_data.h5") + if os.path.exists(data_file_path): + print("Loading data...") + data = pd.read_hdf(data_file_path, "vitals_labs") + statics = pd.read_hdf(data_file_path, "patients") + + ID_COLS = ["subject_id", "hadm_id", "icustay_id"] + + print("Splitting data...") + train_data, holdout_data, y_train, y_holdout_data = data_splitter(statics, + data, + train_frac) + + print("Normalizing data...") + train_data , holdout_data = data_normalization(train_data, holdout_data) + + print("Imputing missing values...") + train_data, holdout_data = [ + simple_imputer(df, ID_COLS) for df in tqdm((train_data, holdout_data), desc="Imputation")] + + + # Skip pivot_table if flatten is False + train, holdout, label_train, label_holdout = train_data, holdout_data, y_train, y_holdout_data + + assert_no_missing_values(train_data, holdout_data, train, holdout) + + train_df, holdout_df = standard_scaler(train, holdout) + + # Creating the dataset + data_x = pd.concat((train_df, holdout_df), axis=0) + data_y = pd.concat((label_train, label_holdout), axis=0) + + assert np.issubdtype(data_x.values.dtype, np.number), "Non-numeric data found in features." + assert np.issubdtype(data_y.values.dtype, np.number), "Non-numeric data found in labels." + + print("Creating dataset...") + if use_LR: + dataset = MimicDataset(data_x.values, data_y.values) + else: + data_x = to_3D_tensor(data_x) + dataset = MimicDataset(data_x, data_y.values) + + # Generate indices for training, validation, test, and early stopping + train_indices, validation_indices, test_indices, early_stop_indices = data_indices(data_x, + train_frac, + validation_frac, + test_frac, + early_stop_frac) + + os.makedirs(os.path.dirname(dataset_path), exist_ok=True) + # Save the dataset to dataset.pkl + print("Saving dataset and indices...") + with open(dataset_path, "wb") as file: + pickle.dump(dataset, file) + print(f"Saved dataset to {dataset_path}") + + # Save train and test indices to indices.pkl + indices_to_save = { + "train_indices": train_indices, + "validation_indices": validation_indices, + "test_indices": test_indices, + "early_stop_indices": early_stop_indices, + } + with open(indices_path, "wb") as file: + pickle.dump(indices_to_save, file) + print(f"Saved train and test indices to {indices_path}") + else: + msg = "Please download the MIMIC-III dataset from https://physionet.org/content/mimiciii/1.4/ and save it in the specified path." 
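+        # The expected file is the all_hourly_data.h5 export produced by the
+        # MIMIC_Extract pipeline referenced at the top of this file.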
+ raise FileNotFoundError(msg) + return dataset, train_indices, validation_indices, test_indices, early_stop_indices + + +def data_splitter(statics, + data, + train_frac): + GAP_TIME = 6 # In hours + WINDOW_SIZE = 24 # In hours + SEED = 1 + + Ys = statics[statics.max_hours > WINDOW_SIZE + GAP_TIME][["los_icu"]] + Ys["los_3"] = Ys["los_icu"] > 3 + Ys.drop(columns=["los_icu"], inplace=True) + Ys["los_3"] = Ys["los_3"].astype(float) + + lvl2 = data[ + (data.index.get_level_values("icustay_id").isin(set(Ys.index.get_level_values("icustay_id")))) & + (data.index.get_level_values("hours_in") < WINDOW_SIZE) + ] + + data_subj_idx, y_subj_idx = [df.index.get_level_values("subject_id") for df in (lvl2, Ys)] + data_subjects = set(data_subj_idx) + assert data_subjects == set(y_subj_idx), "Subject ID pools differ!" + + # Randomly shuffle subjects and compute the sizes of the splits + np.random.seed(SEED) + subjects = np.random.permutation(list(data_subjects)) + N = len(subjects) + N_train = int(train_frac * N) + + # Ensure no overlap between train and test sets + train_subj = subjects[:N_train] + test_subj = subjects[N_train::] + + # Split the data according to the subjects + (train_data, holdout_data), (y_train, y_holdout) = [ + [df[df.index.get_level_values("subject_id").isin(s)] for s in (train_subj, test_subj)] + for df in (lvl2, Ys) + ] + return train_data, holdout_data, y_train, y_holdout + +# def simple_imputer(dataframe, +# ID_COLS): +# idx = pd.IndexSlice +# df = dataframe.copy() +# if len(df.columns.names) > 2: df.columns = df.columns.droplevel(("label", "LEVEL1", "LEVEL2")) + +# df_out = df.loc[:, idx[:, ["mean", "count"]]] +# icustay_means = df_out.loc[:, idx[:, "mean"]].groupby(ID_COLS).mean() + +# df_out.loc[:, idx[:, "mean"]] = ( +# df_out.loc[:, idx[:, "mean"]] +# .groupby(ID_COLS) +# .ffill() # Replace forward fill method +# .groupby(ID_COLS) +# .fillna(icustay_means) # Fill remaining NaNs with icustay_means +# .fillna(0) # Fill any remaining NaNs with 0 +# ) + +# # df_out.loc[:,idx[:,"mean"]] = df_out.loc[:,idx[:,"mean"]].groupby(ID_COLS).fillna( +# # method="ffill" +# # ).groupby(ID_COLS).fillna(icustay_means).fillna(0) + +# df_out.loc[:, idx[:, "count"]] = (df.loc[:, idx[:, "count"]] > 0).astype(float) +# df_out.rename(columns={"count": "mask"}, level="Aggregation Function", inplace=True) + +# is_absent = (1 - df_out.loc[:, idx[:, "mask"]]) +# hours_of_absence = is_absent.cumsum() +# time_since_measured = hours_of_absence - hours_of_absence[is_absent==0].fillna(method="ffill") +# time_since_measured.rename(columns={"mask": "time_since_measured"}, level="Aggregation Function", inplace=True) + +# df_out = pd.concat((df_out, time_since_measured), axis=1) +# df_out.loc[:, idx[:, "time_since_measured"]] = df_out.loc[:, idx[:, "time_since_measured"]].fillna(100) + +# df_out.sort_index(axis=1, inplace=True) +# return df_out + +def simple_imputer(dataframe, ID_COLS): + idx = pd.IndexSlice + df = dataframe.copy() + + # Adjust column levels if necessary + if len(df.columns.names) > 2: + df.columns = df.columns.droplevel(("label", "LEVEL1", "LEVEL2")) + + # Select mean and count columns + df_out = df.loc[:, idx[:, ["mean", "count"]]].copy() # Explicit deep copy + + # Compute group-level means + icustay_means = df_out.loc[:, idx[:, "mean"]].groupby(ID_COLS).transform("mean") + + # Forward fill and fill NaNs with icustay_means + df_out.loc[:, idx[:, "mean"]] = ( + df_out.loc[:, idx[:, "mean"]] + .groupby(ID_COLS) + .ffill() # Forward fill within groups + ) + df_out.loc[:, idx[:, "mean"]] = 
df_out.loc[:, idx[:, "mean"]].fillna(icustay_means) + + # Fill remaining NaNs with 0 + df_out.loc[:, idx[:, "mean"]] = df_out.loc[:, idx[:, "mean"]].fillna(0) + + # Binary mask for count columns + df_out.loc[:, idx[:, "count"]] = (df.loc[:, idx[:, "count"]] > 0).astype(float) + df_out = df_out.rename(columns={"count": "mask"}, level="Aggregation Function") # Avoid inplace=True + + # Calculate time since last measurement + is_absent = (1 - df_out.loc[:, idx[:, "mask"]]) + hours_of_absence = is_absent.cumsum() + time_since_measured = hours_of_absence - hours_of_absence[is_absent == 0].ffill() + time_since_measured.rename(columns={"mask": "time_since_measured"}, level="Aggregation Function", inplace=True) + + # Add time_since_measured to the output + df_out = pd.concat((df_out, time_since_measured), axis=1) + df_out.loc[:, idx[:, "time_since_measured"]] = df_out.loc[:, idx[:, "time_since_measured"]].fillna(100) + + # Sort columns by index + df_out.sort_index(axis=1, inplace=True) + + return df_out + + + + + +def data_indices(dataset, + train_frac, + valid_frac, + test_frac, + early_stop_frac): + N = len(dataset) + N_train = int(train_frac * N) + N_validation = int(valid_frac * N) + N_test = int(test_frac * N) + N_early_stop = int(early_stop_frac * N) + + # Generate sequential indices for training and testing + # Indices from 0 to N_train-1 + train_indices = list(range(N_train)) + # Indices from N_train to N_train + N_validation-1 + validation_indices = list(range(N_train, N_train + N_validation)) + # Indices for test set + test_indices = list(range(N_train + N_validation, N_train + N_validation + N_test)) + # Indices for early stopping + early_stop_indices = list(range(N_train + N_validation + N_test, N_train + N_validation + N_test + N_early_stop)) + return train_indices, validation_indices, test_indices, early_stop_indices + + +def get_mimic_dataloaders(dataset, + train_indices, + validation_indices, + test_indices, + early_stop_indices, + batch_size=128): + train_subset = Subset(dataset, train_indices) + test_subset = Subset(dataset, test_indices) + validation_subset = Subset(dataset, validation_indices) + early_stop_subset = Subset(dataset, early_stop_indices) + + train_loader = DataLoader(train_subset, batch_size, shuffle=False) + test_loader = DataLoader(test_subset, batch_size, shuffle=False) + validation_loader = DataLoader(validation_subset, batch_size, shuffle=False) + early_stop_loader = DataLoader(early_stop_subset, batch_size, shuffle=False) + + return train_loader, validation_loader, test_loader, early_stop_loader + + +def data_normalization(lvl2_train, + lvl2_test): + idx = pd.IndexSlice + lvl2_means, lvl2_stds = lvl2_train.loc[:, idx[:,"mean"]].mean(axis=0), lvl2_train.loc[:, idx[:,"mean"]].std(axis=0) + + lvl2_train.loc[:, idx[:,"mean"]] = (lvl2_train.loc[:, idx[:,"mean"]] - lvl2_means)/lvl2_stds + lvl2_test.loc[:, idx[:,"mean"]] = (lvl2_test.loc[:, idx[:,"mean"]] - lvl2_means)/lvl2_stds + return lvl2_train, lvl2_test + + +def standard_scaler(flat_train, + flat_test): + # Initialize the scaler + scaler = StandardScaler() + + # Identify continuous columns (float64 and int64 types) + continuous_columns = flat_train.select_dtypes(include=["float64", "int64"]).columns + + # Fit the scaler on training data and transform both training and test sets + train_flat_continuous = scaler.fit_transform(flat_train[continuous_columns]) + test_flat_continuous = scaler.transform(flat_test[continuous_columns]) + + # Create copies of the original DataFrames + train_scaled = flat_train.copy() + 
test_scaled = flat_test.copy() + + # Replace continuous columns with the scaled versions + train_scaled[continuous_columns] = train_flat_continuous + test_scaled[continuous_columns] = test_flat_continuous + + # Return the scaled DataFrames + return train_scaled, test_scaled + + +def flatten_multi_index(df): + """Flattens the multi-index DataFrame by resetting the index.""" + return df.reset_index(drop=True) + + +def assert_no_missing_values(*dfs): + """Asserts that no DataFrame in the input list contains any missing values.""" + for df in dfs: + assert not df.isnull().any().any(), "DataFrame contains missing values!" \ No newline at end of file From cc15e721f219065763fd91d3cccdb68ecc0a4766 Mon Sep 17 00:00:00 2001 From: fazelehh Date: Tue, 21 Jan 2025 12:47:08 +0000 Subject: [PATCH 2/5] creating dpsgd example for mimic3 on grud model --- examples/expm/audit.yaml | 44 ++ examples/expm/dpsgd_handler.py | 74 +++ examples/expm/run_dpsgd_main.py | 150 +++++ examples/expm/run_nonprivate_main.py | 133 ++++ examples/expm/utils/dpsgd_model_handler.py | 0 examples/expm/utils/expm_model_handler.py | 0 examples/expm/utils/gru_model_handler.py | 698 +++++++++++++++++++++ examples/expm/utils/utils.py | 197 ++++++ 8 files changed, 1296 insertions(+) create mode 100644 examples/expm/audit.yaml create mode 100644 examples/expm/dpsgd_handler.py create mode 100644 examples/expm/run_dpsgd_main.py create mode 100644 examples/expm/run_nonprivate_main.py create mode 100644 examples/expm/utils/dpsgd_model_handler.py create mode 100644 examples/expm/utils/expm_model_handler.py create mode 100644 examples/expm/utils/gru_model_handler.py create mode 100644 examples/expm/utils/utils.py diff --git a/examples/expm/audit.yaml b/examples/expm/audit.yaml new file mode 100644 index 00000000..171e2316 --- /dev/null +++ b/examples/expm/audit.yaml @@ -0,0 +1,44 @@ +audit: # Configurations for auditing + random_seed: 1234 # Integer specifying the random seed + attack_list: + rmia: + training_data_fraction: 0.5 # Fraction of the auxilary dataset to use for this attack (in each shadow model training) + attack_data_fraction: 0.5 # Fraction of auxiliary dataset to sample from during attack + num_shadow_models: 8 # Number of shadow models to train + online: True # perform online or offline attack + temperature: 2 + gamma: 1.0 + offline_a: 0.33 # parameter from which we compute p(x) from p_OUT(x) such that p_IN(x) = a p_OUT(x) + b. 
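+      # offline_b below is assumed to be the intercept b in the same interpolation p_IN(x) = a * p_OUT(x) + b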
+ offline_b: 0.66 + # qmia: + # training_data_fraction: 1.0 # Fraction of the auxilary dataset (data without train and test indices) to use for training the quantile regressor + # epochs: 5 # Number of training epochs for quantile regression + # population: + # attack_data_fraction: 1.0 # Fraction of the auxilary dataset to use for this attack + lira: + training_data_fraction: 0.5 # Fraction of the auxilary dataset to use for this attack (in each shadow model training) + num_shadow_models: 8 # Number of shadow models to train + online: True # perform online or offline attack + fixed_variance: True # Use a fixed variance for the whole audit + boosting: True + # loss_traj: + # training_distill_data_fraction : 0.7 # Fraction of the auxilary dataset to use for training the distillation models D_s = (1-D_KD)/2 + # number_of_traj: 10 # Number of epochs (number of points in the loss trajectory) + # label_only: False # True or False + # mia_classifier_epochs: 100 + + output_dir: "./examples/expm/leakpro_output" + attack_type: "mia" #mia, gia + +target: + # Target model path + module_path: "./examples/expm/utils/dpsgd_model_handler.py" # either model_grud.py or model_LR.py for logestic regression + model_class: "GRUD" # LR/GRUD + # Data paths + target_folder: "./target_GRUD" # either target_GRUD or target_LR + data_path: "./examples/expm/data/mimic/dataset.pkl" #unflattened dataset for GRUD and flattened dataset for LR + +shadow_model: + model_class: GRUD # LR/GRUD + +distillation_model: \ No newline at end of file diff --git a/examples/expm/dpsgd_handler.py b/examples/expm/dpsgd_handler.py new file mode 100644 index 00000000..50eeef97 --- /dev/null +++ b/examples/expm/dpsgd_handler.py @@ -0,0 +1,74 @@ + +from torch import cuda, device, nn, optim, squeeze +from torch.nn import CrossEntropyLoss +from torch.utils.data import DataLoader +from tqdm import tqdm +from sklearn.metrics import accuracy_score +from leakpro import AbstractInputHandler + + +class MimicInputHandlerGRU(AbstractInputHandler): + """Class to handle the user input for the MIMICIII dataset.""" + + def __init__(self, configs: dict) -> None: + super().__init__(configs = configs) + + def get_criterion(self)->CrossEntropyLoss: + """Set the CrossEntropyLoss for the model.""" + return CrossEntropyLoss() + + def get_optimizer(self, model:nn.Module) -> optim.Optimizer: + """Set the optimizer for the model.""" + learning_rate = 0.01 + return optim.Adam(model.parameters(), lr=learning_rate) + + def convert_to_device(self, x): + device_name = device("cuda" if cuda.is_available() else "cpu") + return x.to(device_name) + + def to_numpy(self, tensor) : + return tensor.detach().cpu().numpy() if tensor.is_cuda else tensor.detach().numpy() + + def train( + self, + dataloader: DataLoader, + model: nn.Module = None, + criterion: nn.Module = None, + optimizer: optim.Optimizer = None, + epochs: int = None, + ) -> dict: + + """Model training procedure.""" + device_name = device("cuda" if cuda.is_available() else "cpu") + model.to(device_name) + model.train() + + criterion = self.get_criterion() + optimizer = self.get_optimizer(model) + + for e in tqdm(range(epochs), desc="Training Progress"): + model.train() + train_acc, train_loss = 0.0, 0.0 + + for _, (x, labels) in enumerate(tqdm(dataloader, desc="Training Batches")): + x = self.convert_to_device(x) + labels = self.convert_to_device(labels) + labels = labels.long() + + optimizer.zero_grad() + output = model(x) + + loss = criterion(squeeze(output), squeeze(labels).long()) + loss.backward() + 
optimizer.step() + train_loss += loss.item() + + train_loss = train_loss/len(dataloader) + binary_predictions = self.to_numpy(output).argmax(axis=1) + + # Ensure labels are integer and 1D + binary_labels = self.to_numpy(labels).astype(int) + # Compute accuracy + train_acc = accuracy_score(binary_labels, binary_predictions) + + return {"model": model, "metrics": {"accuracy": train_acc, "loss": train_loss}} \ No newline at end of file diff --git a/examples/expm/run_dpsgd_main.py b/examples/expm/run_dpsgd_main.py new file mode 100644 index 00000000..2b99f0e1 --- /dev/null +++ b/examples/expm/run_dpsgd_main.py @@ -0,0 +1,150 @@ +import os +import sys + +from torch import zeros +from utils.data_handler import get_mimic_dataloaders, get_mimic_dataset +from opacus.accountants.utils import get_noise_multiplier +from utils.gru_model_handler import * + + +# Generate the dataset and dataloaders +path = os.path.join(os.getcwd(), "examples/expm/data/mimic/") +epsilons = [.0001, .001, .01, .1, .5, 1, 2, 3.5, 5, 7, 10] # epsilons to run over +delta = 1e-5 + + +train_frac = 0.4 +valid_frac = 0.0 +test_frac = 0.0 +early_stop_frac = 0.4 +batch_size = 59 +use_LR = False # True if you want to use the LR model, False if you want to use the GRUD model + +dataset, train_indices, validation_indices, test_indices, early_stop_indices= get_mimic_dataset(path, + train_frac , + valid_frac, + test_frac, + early_stop_frac, + use_LR) + +train_loader, validation_loader, test_loader, early_stop_loader = get_mimic_dataloaders(dataset, + train_indices, + validation_indices, + test_indices, + early_stop_indices, + batch_size) + +sample_rate = 1/len(train_loader) # already incorporates batchsize + +try: + noise_multiplier = get_noise_multiplier(target_epsilon = 2, + target_delta = delta, + sample_rate = sample_rate, + epochs = 21, + epsilon_tolerance = 0.01, + accountant = 'prv', + eps_error = 0.01) +except: + # the prv accountant is not robust to large epsilon (even epsilon = 10) + # so we will use rdp when it fails, so the actual epsilon may be slightly off + # see https://github.com/pytorch/opacus/issues/604 + noise_multiplier = get_noise_multiplier(target_epsilon = 2, + target_delta = delta, + sample_rate = sample_rate, + epochs = 21, + epsilon_tolerance = 0.01, + accountant = 'rdp') + + + + +optimized_hyperparams ={ + "cell_size": 58, + "hidden_size": 78, + "learning_rate": 0.0004738759319792616, + "num_epochs":37, + "patience_early_stopping": 20, + "patience_lr_scheduler": 5, + "batch_size": 59, + "seed": 4410, + "min_delta": 0.00001, + "epsilon": 10, + "max_grad_norm": 1, + } +n_features = int(dataset.x.shape[1]/3) +X_mean = zeros(1,dataset.x.shape[2],n_features) + +model_params = {k: optimized_hyperparams[k] for k in ["cell_size", "hidden_size", "batch_size"]} + +# Add other required parameters to model_params +model_params.update({ + "input_size": n_features, + "X_mean": X_mean, + "output_last": False, + "bn_flag": False, + "droupout": 0.1, +}) + + +# Initialize the model with filtered parameters +model = GRUD(**model_params) +# Train the model +results= dpsgd_gru_trained_model_and_metadata( + model, + train_loader, + early_stop_loader, + noise_multiplier, + max_grad_norm = optimized_hyperparams['max_grad_norm'], + epochs=optimized_hyperparams['num_epochs'], + patience_early_stopping = optimized_hyperparams["patience_early_stopping"], + patience_lr= optimized_hyperparams["patience_lr_scheduler"], + min_delta = optimized_hyperparams["min_delta"], + learning_rate = optimized_hyperparams["learning_rate"]) +train_losses, 
test_losses , train_acc, test_acc, best_model,niter_per_epoch, privacy_engine = results + + +import matplotlib.pyplot as plt + +# Convert losses to numpy-compatible lists directly +train_losses_cpu = [float(loss) for loss in train_losses] +test_losses_cpu = [float(loss) for loss in test_losses] + +# Plot training and test accuracy +plt.figure(figsize=(5, 4)) + +plt.subplot(1, 2, 1) +plt.plot(train_acc, label="Train Accuracy") +plt.plot(test_acc, label="Test Accuracy") +plt.xlabel("Epoch") +plt.ylabel("Accuracy") +plt.title("Accuracy over Epochs") +plt.legend() + +# Plot training and test loss +plt.subplot(1, 2, 2) +plt.plot(train_losses, label="Train Loss") +plt.plot(test_losses, label="Test Loss") +plt.xlabel("Epoch") +plt.ylabel("Loss") +plt.title("Loss over Epochs") +plt.legend() + +plt.tight_layout() +plt.show() +plt.savefig("psgd_gru.png") + + + +from dpsgd_handler import MimicInputHandlerGRU + +from leakpro import LeakPro + +# Read the config file +config_path = "./examples/expm/audit.yaml" + +# Prepare leakpro object +leakpro = LeakPro(MimicInputHandlerGRU, config_path) + +# Run the audit +mia_results = leakpro.run_audit(return_results=True) + diff --git a/examples/expm/run_nonprivate_main.py b/examples/expm/run_nonprivate_main.py new file mode 100644 index 00000000..59314519 --- /dev/null +++ b/examples/expm/run_nonprivate_main.py @@ -0,0 +1,133 @@ +import os +import sys + +from torch import zeros +from utils.data_handler import get_mimic_dataloaders, get_mimic_dataset +from opacus.accountants.utils import get_noise_multiplier +from utils.gru_model_handler import * + + +# Generate the dataset and dataloaders +path = os.path.join(os.getcwd(), "examples/expm/data/mimic/") +epsilons = [.0001, .001, .01, .1, .5, 1, 2, 3.5, 5, 7, 10] # epsilons to run over +delta = 1e-5 + + +train_frac = 0.4 +valid_frac = 0.0 +test_frac = 0.0 +early_stop_frac = 0.4 +batch_size = 59 +use_LR = False # True if you want to use the LR model, False if you want to use the GRUD model + +dataset, train_indices, validation_indices, test_indices, early_stop_indices= get_mimic_dataset(path, + train_frac , + valid_frac, + test_frac, + early_stop_frac, + use_LR) + +train_loader, validation_loader, test_loader, early_stop_loader = get_mimic_dataloaders(dataset, + train_indices, + validation_indices, + test_indices, + early_stop_indices, + batch_size) + +sample_rate = 1/len(train_loader) # already incorporates batchsize + +try: + noise_multiplier = get_noise_multiplier(target_epsilon = 2, + target_delta = delta, + sample_rate = sample_rate, + epochs = 21, + epsilon_tolerance = 0.01, + accountant = 'prv', + eps_error = 0.01) +except: + # the prv accountant is not robust to large epsilon (even epsilon = 10) + # so we will use rdp when it fails, so the actual epsilon may be slightly off + # see https://github.com/pytorch/opacus/issues/604 + noise_multiplier = get_noise_multiplier(target_epsilon = 2, + target_delta = delta, + sample_rate = sample_rate, + epochs = 21, + epsilon_tolerance = 0.01, + accountant = 'rdp') + + +# Initialize model with the best hyperparameters +model_params = { + 'X_mean': X_mean, + 'input_size': X_mean.shape[2], + 'device_id': device, + 'cell_size': h['cell_size'], + 'hidden_size': h['hidden_size'], + 'batch_size': h['batch_size'], + 'apply_sigmoid': h['loss'] == 'l2', # Only add apply_sigmoid for 'l2' loss + 'use_bn': h["use_bn"] != "nobn", # Use batch norm unless 'nobn' specified +} + + +optimized_hyperparams ={ + "cell_size": 58, + "hidden_size": 78, + "learning_rate": 0.0004738759319792616, + 
"num_epochs":37, + "patience_early_stopping": 20, + "patience_lr_scheduler": 5, + "batch_size": 59, + "seed": 4410, + "min_delta": 0.00001, + "epsilon": 10, + "max_grad_norm": 1, + } +n_features = int(dataset.x.shape[1]/3) +X_mean = zeros(1,dataset.x.shape[2],n_features) + +model_params = {k: optimized_hyperparams[k] for k in ["cell_size", "hidden_size", "batch_size"]} + +# Add other required parameters to model_params +model_params.update({ + "input_size": n_features, + "X_mean": X_mean, + "output_last": False +}) + + +# if h["loss"] == "l2": +# loss_fn = lambda preds, targets: torch.nn.MSELoss()(torch.squeeze(preds), targets) +# elif h["loss"] == "bce": +# loss_fn = torch.nn.BCEWithLogitsLoss() + + + +# Initialize the model with filtered parameters +model = GRUD(**model_params) +# Train the model +results= dpsgd_gru_trained_model_and_metadata( + model, + train_loader, + early_stop_loader, + noise_multiplier, + max_grad_norm = optimized_hyperparams['max_grad_norm'], + epochs=optimized_hyperparams['num_epochs'], + patience_early_stopping = optimized_hyperparams["patience_early_stopping"], + patience_lr= optimized_hyperparams["patience_lr_scheduler"], + min_delta = optimized_hyperparams["min_delta"], + learning_rate = optimized_hyperparams["learning_rate"]) +train_losses, test_losses , train_acc, test_acc, best_model,niter_per_epoch, privacy_engine = results + +optimized_hyperparams ={ + "cell_size": 58, + "hidden_size": 78, + "learning_rate": 0.0004738759319792616, + "num_epochs":37, + "patience_early_stopping": 20, + "patience_lr_scheduler": 5, + "batch_size": 59, + "seed": 4410, + "min_delta": 0.00001, + "epsilon": 10, + } + diff --git a/examples/expm/utils/dpsgd_model_handler.py b/examples/expm/utils/dpsgd_model_handler.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/expm/utils/expm_model_handler.py b/examples/expm/utils/expm_model_handler.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/expm/utils/gru_model_handler.py b/examples/expm/utils/gru_model_handler.py new file mode 100644 index 00000000..907e7833 --- /dev/null +++ b/examples/expm/utils/gru_model_handler.py @@ -0,0 +1,698 @@ +""" +This file is inspired by https://github.com/MLforHealth/MIMIC_Extract +MIT License +Copyright (c) 2019 MIT Laboratory for Computational Physiology +""" +import math +import os +import pickle +import time +import warnings + +import numpy as np +import pandas as pd +import torch.nn.functional as F +import torch.utils.data as utils +from sklearn.metrics import accuracy_score +from torch import Tensor, cat, cuda, device, exp, eye, from_numpy, isnan, max, nn, optim, save, sigmoid, squeeze, tanh, zeros, no_grad +from torch.autograd import Variable +from torch.nn.parameter import Parameter +from torch.optim.lr_scheduler import ReduceLROnPlateau +from tqdm import tqdm +from opacus import PrivacyEngine, GradSampleModule + +def to_3D_tensor(df): + idx = pd.IndexSlice + np_3D = np.dstack([df.loc[idx[:, :, :, i], :].values for i in sorted(set(df.index.get_level_values("hours_in")))]) + return from_numpy(np_3D) + +def prepare_dataloader(df, Ys, batch_size, shuffle=True): + """Dfs = (df_train, df_dev, df_test). + df_* = (subject, hadm, icustay, hours_in) X (level2, agg fn \ni {mask, mean, time}) + Ys_series = (subject, hadm, icustay) => label. 
+ """ + X = from_numpy(to_3D_tensor(df).astype(np.float32)) + label = from_numpy(Ys.values.astype(np.int64)) + dataset = utils.TensorDataset(X, label) + return utils.DataLoader(dataset, batch_size =int(batch_size) , shuffle=shuffle, drop_last = True) + +class FilterLinear(nn.Module): + def __init__(self, in_features, out_features, filter_square_matrix, device, bias=True): + """filter_square_matrix : filter square matrix, whose each elements is 0 or 1. + """ + super(FilterLinear, self).__init__() + self.in_features = in_features + self.out_features = out_features + + assert in_features > 1 and out_features > 1, "Passing in nonsense sizes" + + self.filter_square_matrix = None + self.filter_square_matrix = Variable(filter_square_matrix.to(device), requires_grad=False) + + self.weight = Parameter(Tensor(out_features, in_features)).to(device) + + if bias: + self.bias = Parameter(Tensor(out_features)).to(device) + else: + self.register_parameter("bias", None) + self.reset_parameters() + + def reset_parameters(self): + stdv = 1. / math.sqrt(self.weight.size(1)) + self.weight.data.uniform_(-stdv, stdv) + if self.bias is not None: + self.bias.data.uniform_(-stdv, stdv) + + def forward(self, x): + return F.linear( + x, + self.filter_square_matrix.mul(self.weight), + self.bias + ) + + def __repr__(self): + return self.__class__.__name__ + "(" \ + + "in_features=" + str(self.in_features) \ + + ", out_features=" + str(self.out_features) \ + + ", bias=" + str(self.bias is not None) + ")" + +class GRUD(nn.Module): + def __init__(self, input_size, cell_size, hidden_size, X_mean, batch_size = 0, + droupout=0, bn_flag = True, output_last = False): + """With minor modifications from https://github.com/zhiyongc/GRU-D/ + + Recurrent Neural Networks for Multivariate Times Series with Missing Values + GRU-D: GRU exploit two representations of informative missingness patterns, i.e., masking and time interval. + cell_size is the size of cell_state. + + Implemented based on the paper: + @article{che2018recurrent, + title={Recurrent neural networks for multivariate time series with missing values}, + author={Che, Zhengping and Purushotham, Sanjay and Cho, Kyunghyun and Sontag, David and Liu, Yan}, + journal={Scientific reports}, + volume={8}, + number={1}, + pages={6085}, + year={2018}, + publisher={Nature Publishing Group} + } + + GRU-D: + input_size: variable dimension of each time + hidden_size: dimension of hidden_state + mask_size: dimension of masking vector + X_mean: the mean of the historical input data + """ + + super(GRUD, self).__init__() + + # Save init params to a dictionary + self.init_params = { + "input_size": input_size, + "cell_size": cell_size, + "hidden_size": hidden_size, + "X_mean": X_mean, + "batch_size": batch_size, + "output_last": output_last, + "dropout":droupout, + "bn_flag":bn_flag, + } + + self.hidden_size = hidden_size + self.delta_size = input_size + self.mask_size = input_size + self.bn_flag = bn_flag + + self.device = device("cuda" if cuda.is_available() else "cpu") + self.identity = eye(input_size).to(self.device) + self.X_mean = Variable(Tensor(X_mean).to(self.device)) + + # Wz, Uz are part of the same network. the bias is bz + self.zl = nn.Linear(input_size + hidden_size + self.mask_size, hidden_size).to(self.device) + + # Wr, Ur are part of the same network. the bias is br + self.rl = nn.Linear(input_size + hidden_size + self.mask_size, hidden_size).to(self.device) + + # W, U are part of the same network. 
the bias is b + self.hl = nn.Linear(input_size + hidden_size + self.mask_size, hidden_size).to(self.device) + + self.gamma_x_l = FilterLinear(self.delta_size, self.delta_size, self.identity, self.device) + self.gamma_h_l = nn.Linear(self.delta_size, self.hidden_size).to(self.device) + self.output_last = output_last + + #TODO: this part differs from gitcode + # self.fc = nn.Linear(self.hidden_size, 2).to(self.device) + # self.bn= nn.BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True).to(self.device) + # self.drop=nn.Dropout(p=0.7, inplace=False) + self.fc = nn.Linear(self.hidden_size, 1) # a probability score + self.drop=nn.Dropout(p=0.48, inplace=False) + if self.bn_flag: + self.bn= nn.BatchNorm1d(self.hidden_size, eps=1e-05, momentum=0.1, affine=True) + + def step(self, x, x_last_obsv, x_mean, h, mask, delta): + """Inputs: + x: input tensor + x_last_obsv: input tensor with forward fill applied + x_mean: the mean of each feature + h: the hidden state of the network + mask: the mask of whether or not the current value is observed + delta: the tensor indicating the number of steps since the last time a feature was observed. + + Returns: + h: the updated hidden state of the network + + """ + + # Assert to check for NaNs in x_mean + assert not isnan(x_mean).any(), "NaN values found in x_mean" + + batch_size = x.size()[0] + feature_size = x.size()[1] + zero_x = zeros(batch_size, feature_size).to(self.device) + zero_h = zeros(batch_size, self.hidden_size).to(self.device) + + + gamma_x_l_delta = self.gamma_x_l(delta) + delta_x = exp(-max(zero_x, gamma_x_l_delta)) + + gamma_h_l_delta = self.gamma_h_l(delta) + delta_h = exp(-max(zero_h, gamma_h_l_delta)) + + x_mean = x_mean.repeat(batch_size, 1) + + x = mask * x + (1 - mask) * (delta_x * x_last_obsv + (1 - delta_x) * x_mean) + h = delta_h * h + + combined = cat((x, h, mask), 1) + # Assert to check for NaNs in combined + assert not isnan(combined).any(), "NaN values found in combined" + + z = sigmoid(self.zl(combined)) #sigmoid(W_z*x_t + U_z*h_{t-1} + V_z*m_t + bz) + r = sigmoid(self.rl(combined)) #sigmoid(W_r*x_t + U_r*h_{t-1} + V_r*m_t + br) + combined_new = cat((x, r*h, mask), 1) + h_tilde = tanh(self.hl(combined_new)) #tanh(W*x_t +U(r_t*h_{t-1}) + V*m_t) + b + h = (1 - z) * h + z * h_tilde + + return h + + + def forward(self, X): + """X: Input tensor of shape (batch_size, time_steps * 3, features) + The tensor includes Mask, Measurement, and Delta sequentially for each time step. 
+ """ + + # Step 1: Split the input tensor into Mask, Measurement, and Delta + batch_size = X.size(0) + time_steps = X.size(1) // 3 # Since every 3 consecutive steps represent Mask, Measurement, and Delta + + # Reshape X into 3 separate tensors for Mask, Measurement, and Delta + Mask = X[:, np.arange(0, X.size(1), 3), :] # Extract Mask + Measurement = X[:, np.arange(1, X.size(1), 3), :] # Extract Measurement + Delta = X[:, np.arange(2, X.size(1), 3), :] # Extract Delta + + # Transpose tensors to match (batch_size, time_steps, features) + Mask = Mask.transpose(1, 2) + Measurement = Measurement.transpose(1, 2) + Delta = Delta.transpose(1, 2) + + # X_last_obsv is initialized to Measurement at the starting point + X_last_obsv = Measurement + + # Step 2: Initialize hidden state + step_size = Measurement.size(1) # Number of time points + Hidden_State = self.initHidden(batch_size) + + # Step 3: Iterate through time steps and update the GRU hidden state + outputs = None + for i in range(step_size): + Hidden_State = self.step( + squeeze(Measurement[:, i, :], 1), + squeeze(X_last_obsv[:, i, :], 1), + squeeze(self.X_mean[:, i, :], 1), + Hidden_State, + squeeze(Mask[:, i, :], 1), + squeeze(Delta[:, i, :], 1), + ) + # Collect hidden states + if outputs is None: + outputs = Hidden_State.unsqueeze(1) + else: + outputs = cat((Hidden_State.unsqueeze(1), outputs), 1) + + # Step 4: Predict a binary outcome using FC, BatchNorm, and Dropout layers + if self.bn_flag: + return self.fc(self.bn(self.drop(Hidden_State))) + else: + return self.fc(self.drop(Hidden_State)) + + + def initHidden(self, batch_size): + Hidden_State = Variable(zeros(batch_size, self.hidden_size)).to(self.device) + return Hidden_State + +def to_numpy(tensor): + return tensor.detach().cpu().numpy() if tensor.is_cuda else tensor.detach().numpy() + + + + +def dpsgd_gru_trained_model_and_metadata(model, + train_dataloader, + test_dataloader, + noise_multiplier, + max_grad_norm, + epochs, + patience_early_stopping, + patience_lr, + min_delta, + learning_rate): + + print("Model Structure: ", model) + print("Start Training ... ") + + # Check if the input tensor is 3D + # This check is nessary because the GRU-D model expects a 3D tensor, meaning the input data should not be flattened + # The input tensor should have the shape (num_datapoints, num_features, num_timepoints) + if train_dataloader.dataset.dataset.x.ndimension() != 3: + warnings.warn("Input tensor is not 3D. 
There might be a mismatch between .", UserWarning) + + # Early Stopping + min_loss_epoch_valid = float("inf") # Initialize to infinity for comparison + patient_epoch = 0 # Initialize patient counter + + device_name = device("cuda" if cuda.is_available() else "cpu") + + if isinstance(model, nn.Sequential): + output_last = model[-1].output_last + print("Output type dermined by the last layer") + else: + output_last = model.output_last + print("Output type dermined by the model") + + criterion_BCE = nn.BCEWithLogitsLoss() + criterion_CEL = nn.CrossEntropyLoss() + criterion_MSE = nn.MSELoss() + optimizer = optim.Adam(model.parameters(), lr=learning_rate) + + + # make the model private + privacy_engine = PrivacyEngine(accountant = 'prv') + priv_model, priv_opt, priv_train_dataloader = privacy_engine.make_private( + module=model, + optimizer=optimizer, + data_loader=train_dataloader, + noise_multiplier=noise_multiplier, + max_grad_norm=max_grad_norm, + ) + + # Reduce learning rate when a metric has stopped improving + scheduler = ReduceLROnPlateau(priv_opt, mode="min", patience = patience_lr) + + + train_losses = [] + test_losses = [] + test_acces = [] + train_acces = [] + + niter_per_epoch = 0 + + cur_time = time.time() + pre_time = time.time() + + X_mean = priv_model._module.X_mean + priv_model.to(device_name) + + for epoch in tqdm(range(epochs), desc="Training Progress"): + + priv_model.train() + train_loss = 0.0 + + test_dataloader_iter = iter(test_dataloader) + + for _, (X, labels) in enumerate(tqdm(priv_train_dataloader, desc="Training Batches")): + + if epoch == 0: + niter_per_epoch += 1 + + X = X.to(device_name) + labels = labels.to(device_name) + labels = labels.long().float() + prediction = priv_model(X) + prediction = prediction.squeeze(dim=1) + + output_last = True + if output_last: + loss = criterion_BCE(prediction, labels) + else: + full_labels = cat((X[:,1:,:], labels), dim = 1) + loss = criterion_MSE(prediction, full_labels) + + + priv_opt.zero_grad() + loss.backward() + priv_opt.step() + train_loss += loss.item() + + train_loss /= len(priv_train_dataloader) + train_losses.append(train_loss) + + # Convert predictions to class indices + binary_predictions = (prediction > 0).float().cpu().numpy() + + # Ensure labels are integer and 1D + binary_labels = to_numpy(labels).astype(int) + # Compute accuracy + train_acc = accuracy_score(binary_labels, binary_predictions) + train_acces.append(train_acc) + + # test + priv_model.eval() + try: + X_test, labels_test = next(test_dataloader_iter) + except StopIteration: + valid_dataloader_iter = iter(test_dataloader) + X_test, labels_test = next(valid_dataloader_iter) + + + + X_test = X_test.to(device_name) + labels_test = labels_test.to(device_name) + labels_test = labels_test.long().float() + + + with no_grad(): + prediction_test = priv_model(X_test) + prediction_test = prediction_test.squeeze(dim=1) + + + if output_last: + test_loss = criterion_BCE(prediction_test, labels_test) + else: + full_labels_val = cat((X_test[:,1:,:], labels_test), dim = 1) + test_loss = criterion_MSE(prediction_test, full_labels_val) + + test_loss = test_loss.cpu().item() + test_losses.append(test_loss) + + # Convert predictions to class indices + binary_predictions_test = (prediction_test > 0).float().cpu().numpy() + + # Ensure labels are integer and 1D + binary_labels_test = to_numpy(labels_test).astype(int) + # Compute accuracy + test_acc = accuracy_score(binary_labels_test, binary_predictions_test) + test_acces.append(test_acc) + + # Early stopping + # Assume 
test_loss is computed for validation set + if test_loss < min_loss_epoch_valid - min_delta: # Improvement condition + min_loss_epoch_valid = test_loss + patient_epoch = 0 + print(f"Epoch {epoch}: Validation loss improved to {test_loss:.4f}") + else: + patient_epoch += 1 + print(f"Epoch {epoch}: No improvement. Patience counter: {patient_epoch}/{patience_early_stopping}") + + if patient_epoch >= patience_early_stopping: + print(f"Early stopping at epoch {epoch}. Best validation loss: {min_loss_epoch_valid:.4f}") + break + + # Step the scheduler + scheduler.step(test_loss) + + # Check the learning rate + current_lr = priv_opt.param_groups[0]["lr"] + print(f"Learning Rate: {current_lr:.6f}") + + # Stop if learning rate becomes too small + if current_lr < 1e-6: + print("Learning rate too small, stopping training.") + break + + + # Print training parameters + cur_time = time.time() + print("Epoch: {}, train_loss: {}, valid_loss: {}, time: {}".format( \ + epoch, \ + np.around(train_loss, decimals=8),\ + np.around(test_loss, decimals=8),\ + np.around(cur_time - pre_time, decimals=2))) + pre_time = cur_time + # Move the model back to the CPU + # Ensure the target directory exists + os.makedirs("target_GRUD", exist_ok=True) + priv_model.to("cpu") + with open("target_GRUD/target_model.pkl", "wb") as f: + save(priv_model.state_dict(), f) + + # Create metadata and store it + meta_data = {} + meta_data["train_indices"] = priv_train_dataloader.dataset.indices + meta_data["test_indices"] = test_dataloader.dataset.indices + meta_data["num_train"] = len(meta_data["train_indices"]) + + # Write init params + meta_data["init_params"] = {} + for key, value in model.init_params.items(): + meta_data["init_params"][key] = value + + # read out optimizer parameters + meta_data["optimizer"] = {} + meta_data["optimizer"]["name"] = priv_opt.__class__.__name__.lower() + meta_data["optimizer"]["lr"] = priv_opt.param_groups[0].get("lr", 0) + meta_data["optimizer"]["weight_decay"] = priv_opt.param_groups[0].get("weight_decay", 0) + meta_data["optimizer"]["momentum"] = priv_opt.param_groups[0].get("momentum", 0) + meta_data["optimizer"]["dampening"] = priv_opt.param_groups[0].get("dampening", 0) + meta_data["optimizer"]["nesterov"] = priv_opt.param_groups[0].get("nesterov", False) + + # read out criterion parameters + meta_data["loss"] = {} + meta_data["loss"]["name"] = criterion_CEL.__class__.__name__.lower() + + meta_data["batch_size"] = priv_train_dataloader.batch_size + meta_data["epochs"] = epochs + meta_data["train_acc"] = train_acc + meta_data["test_acc"] = test_acc + meta_data["train_loss"] = train_loss + meta_data["test_loss"] = test_loss + meta_data["dataset"] = "mimiciii" + with open("target_GRUD/priv_model_metadata.pkl", "wb") as f: + pickle.dump(meta_data, f) + return [train_losses, test_losses, train_acces, test_acces, priv_model, niter_per_epoch, privacy_engine] + + + + + + + + + + + +def gru_trained_model_and_metadata(model, + train_dataloader, + test_dataloader, + epochs, + patience_early_stopping, + patience_lr, + min_delta, + learning_rate): + + print("Model Structure: ", model) + print("Start Training ... ") + + # Check if the input tensor is 3D + # This check is nessary because the GRU-D model expects a 3D tensor, meaning the input data should not be flattened + # The input tensor should have the shape (num_datapoints, num_features, num_timepoints) + if train_dataloader.dataset.dataset.x.ndimension() != 3: + warnings.warn("Input tensor is not 3D. 
There might be a mismatch between .", UserWarning) + + # Early Stopping + min_loss_epoch_valid = float("inf") # Initialize to infinity for comparison + patient_epoch = 0 # Initialize patient counter + + device_name = device("cuda" if cuda.is_available() else "cpu") + + if isinstance(model, nn.Sequential): + output_last = model[-1].output_last + print("Output type dermined by the last layer") + else: + output_last = model.output_last + print("Output type dermined by the model") + + ##TODO BCE + criterion_CEL = nn.CrossEntropyLoss() + criterion_MSE = nn.MSELoss() + criterion_BCE = nn.BCEWithLogitsLoss() + optimizer = optim.Adam(model.parameters(), lr=learning_rate) + + # Reduce learning rate when a metric has stopped improving + scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience = patience_lr) + + + train_losses = [] + test_losses = [] + test_acces = [] + train_acces = [] + + + cur_time = time.time() + pre_time = time.time() + + + model.to(device_name) + + for epoch in tqdm(range(epochs), desc="Training Progress"): + + model.train() + train_loss = 0.0 + + test_dataloader_iter = iter(test_dataloader) + + for _, (X, labels) in enumerate(tqdm(train_dataloader, desc="Training Batches")): + + X = X.to(device_name) + labels = labels.to(device_name) + labels = labels.long() + prediction = model(X) + + output_last = True + if output_last: + loss = criterion_CEL(squeeze(prediction), squeeze(labels)) + else: + full_labels = cat((X[:,1:,:], labels), dim = 1) + loss = criterion_MSE(prediction, full_labels) + + + optimizer.zero_grad() + loss.backward() + optimizer.step() + train_loss += loss.item() + + train_loss /= len(train_dataloader) + train_losses.append(train_loss) + + # Convert predictions to class indices + binary_predictions = to_numpy(prediction).argmax(axis=1) + + # Ensure labels are integer and 1D + binary_labels = to_numpy(labels).astype(int) + # Compute accuracy + train_acc = accuracy_score(binary_labels, binary_predictions) + train_acces.append(train_acc) + + # test + model.eval() + try: + X_test, labels_test = next(test_dataloader_iter) + except StopIteration: + valid_dataloader_iter = iter(test_dataloader) + X_test, labels_test = next(valid_dataloader_iter) + + + model.zero_grad() + X_test = X_test.to(device_name) + labels_test = labels_test.to(device_name) + labels_test = labels_test.long() + + prediction_test = model(X_test) + + + + if output_last: + test_loss = criterion_CEL(squeeze(prediction_test), squeeze(labels_test)) + else: + full_labels_val = cat((X_test[:,1:,:], labels_test), dim = 1) + test_loss = criterion_MSE(prediction_test, full_labels_val) + + test_loss = test_loss.cpu().item() + test_losses.append(test_loss) + + # Convert predictions to class indices + binary_predictions_test = to_numpy(prediction_test).argmax(axis=1) + + # Ensure labels are integer and 1D + binary_labels_test = to_numpy(labels_test).astype(int) + # Compute accuracy + test_acc = accuracy_score(binary_labels_test, binary_predictions_test) + test_acces.append(test_acc) + + # Early stopping + # Assume test_loss is computed for validation set + if test_loss < min_loss_epoch_valid - min_delta: # Improvement condition + min_loss_epoch_valid = test_loss + patient_epoch = 0 + print(f"Epoch {epoch}: Validation loss improved to {test_loss:.4f}") + else: + patient_epoch += 1 + print(f"Epoch {epoch}: No improvement. Patience counter: {patient_epoch}/{patience_early_stopping}") + + if patient_epoch >= patience_early_stopping: + print(f"Early stopping at epoch {epoch}. 
Best validation loss: {min_loss_epoch_valid:.4f}") + break + + # Step the scheduler + scheduler.step(test_loss) + + # Check the learning rate + current_lr = optimizer.param_groups[0]["lr"] + print(f"Learning Rate: {current_lr:.6f}") + + # Stop if learning rate becomes too small + if current_lr < 1e-6: + print("Learning rate too small, stopping training.") + break + + + # Print training parameters + cur_time = time.time() + print("Epoch: {}, train_loss: {}, valid_loss: {}, time: {}".format( \ + epoch, \ + np.around(train_loss, decimals=8),\ + np.around(test_loss, decimals=8),\ + np.around(cur_time - pre_time, decimals=2))) + pre_time = cur_time + # Move the model back to the CPU + # Ensure the target directory exists + os.makedirs("target_GRUD", exist_ok=True) + model.to("cpu") + with open("target_GRUD/target_model.pkl", "wb") as f: + save(model.state_dict(), f) + + # Create metadata and store it + meta_data = {} + meta_data["train_indices"] = train_dataloader.dataset.indices + meta_data["test_indices"] = test_dataloader.dataset.indices + meta_data["num_train"] = len(meta_data["train_indices"]) + + # Write init params + meta_data["init_params"] = {} + for key, value in model.init_params.items(): + meta_data["init_params"][key] = value + + # read out optimizer parameters + meta_data["optimizer"] = {} + meta_data["optimizer"]["name"] = optimizer.__class__.__name__.lower() + meta_data["optimizer"]["lr"] = optimizer.param_groups[0].get("lr", 0) + meta_data["optimizer"]["weight_decay"] = optimizer.param_groups[0].get("weight_decay", 0) + meta_data["optimizer"]["momentum"] = optimizer.param_groups[0].get("momentum", 0) + meta_data["optimizer"]["dampening"] = optimizer.param_groups[0].get("dampening", 0) + meta_data["optimizer"]["nesterov"] = optimizer.param_groups[0].get("nesterov", False) + + # read out criterion parameters + meta_data["loss"] = {} + meta_data["loss"]["name"] = criterion_CEL.__class__.__name__.lower() + + meta_data["batch_size"] = train_dataloader.batch_size + meta_data["epochs"] = epochs + meta_data["train_acc"] = train_acc + meta_data["test_acc"] = test_acc + meta_data["train_loss"] = train_loss + meta_data["test_loss"] = test_loss + meta_data["dataset"] = "mimiciii" + with open("target_GRUD/model_metadata.pkl", "wb") as f: + pickle.dump(meta_data, f) + return train_losses, test_losses, train_acces, test_acces + + diff --git a/examples/expm/utils/utils.py b/examples/expm/utils/utils.py new file mode 100644 index 00000000..6d3ae71d --- /dev/null +++ b/examples/expm/utils/utils.py @@ -0,0 +1,197 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
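+# Note: this module carries a local copy of Opacus' get_noise_multiplier; the run
+# scripts in this example import the upstream version from opacus.accountants.utils
+# directly. A minimal, hypothetical usage sketch with the epsilon/delta/epochs used
+# elsewhere in this example (the sample_rate value here is made up):
+#
+#     sigma = get_noise_multiplier(target_epsilon=2, target_delta=1e-5,
+#                                  sample_rate=0.01, epochs=21,
+#                                  accountant="prv")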
+ +from typing import Optional + +from opacus.accountants import create_accountant +# file is from MIMIC Extract Paper +import copy, math, os, pickle, time, pandas as pd, numpy as np, scipy.stats as ss + +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score + +from opacus import PrivacyEngine, GradSampleModule +import torch, torch.utils.data as utils, torch.nn as nn, torch.nn.functional as F, torch.optim as optim +from torch.autograd import Variable +from torch.nn.parameter import Parameter +from torch.optim.lr_scheduler import ReduceLROnPlateau + + +from pathlib import Path + + +MAX_SIGMA = 1e6 + + +def get_noise_multiplier( + *, + target_epsilon: float, + target_delta: float, + sample_rate: float, + epochs: Optional[int] = None, + steps: Optional[int] = None, + accountant: str = "rdp", + epsilon_tolerance: float = 0.01, + **kwargs, +) -> float: + r""" + Computes the noise level sigma to reach a total budget of (target_epsilon, target_delta) + at the end of epochs, with a given sample_rate + + Args: + target_epsilon: the privacy budget's epsilon + target_delta: the privacy budget's delta + sample_rate: the sampling rate (usually batch_size / n_data) + epochs: the number of epochs to run + steps: number of steps to run + accountant: accounting mechanism used to estimate epsilon + epsilon_tolerance: precision for the binary search + Returns: + The noise level sigma to ensure privacy budget of (target_epsilon, target_delta) + """ + if (steps is None) == (epochs is None): + raise ValueError( + "get_noise_multiplier takes as input EITHER a number of steps or a number of epochs" + ) + if steps is None: + steps = int(epochs / sample_rate) + + eps_high = float("inf") + accountant = create_accountant(mechanism=accountant) + + sigma_low, sigma_high = 0, 10 + while eps_high > target_epsilon: + sigma_high = 2 * sigma_high + accountant.history = [(sigma_high, sample_rate, steps)] + eps_high = accountant.get_epsilon(delta=target_delta, **kwargs) + if sigma_high > MAX_SIGMA: + raise ValueError("The privacy budget is too low.") + + while target_epsilon - eps_high > epsilon_tolerance: + sigma = (sigma_low + sigma_high) / 2 + accountant.history = [(sigma, sample_rate, steps)] + eps = accountant.get_epsilon(delta=target_delta, **kwargs) + + if eps < target_epsilon: + sigma_high = sigma + eps_high = eps + else: + sigma_low = sigma + + return sigma_high + + +def Train_Model_DPSGD(pre_model, loss_fn, pre_train_dataloader, noise_multiplier, + max_grad_norm = 1, num_epochs = 300, patience = 1000, + learning_rate=1e-3, batch_size=None): + """ + Inputs: + pre_model: a GRUD model + loss_fn: the loss function to use + pre_train_dataloader: training data + noise_multiplier: the noise multiplier for dpsgd + max_grad_norm: the max norm for gradient in dpsgd + num_epochs: number of times over the training data + patience: used for decreasing learning rate + min_delta: if the loss stays within this value on the next step stop early + batch_size: size of a batch + + Returns: + best_model + losses_train + losses_epochs_train + """ + pre_opt = torch.optim.Adam(pre_model.parameters(), lr = learning_rate) + + # make private + privacy_engine = PrivacyEngine(accountant = 'prv') + priv_model, priv_opt, priv_train_dataloader = privacy_engine.make_private( + module=pre_model, + optimizer=pre_opt, + data_loader=pre_train_dataloader, + noise_multiplier=noise_multiplier, + 
max_grad_norm=max_grad_norm, + ) + scheduler = ReduceLROnPlateau(priv_opt, 'min', patience=patience, verbose = True) + +# losses_train = [] +# losses_epochs_train = [] + niter_per_epoch = 0 + + # BE CAREFUL! The mean should be computed privately. + X_mean = priv_model._module.X_mean + for epoch in range(num_epochs): +# losses_epoch_train = [] + + for X, labels in priv_train_dataloader: + if epoch == 0: + niter_per_epoch += 1 # needed to compute epsilon later if we want to + mask = X[:, np.arange(0, X.shape[1], 3), :] + measurement = X[:, np.arange(1, X.shape[1], 3), :] + time_ = X[:, np.arange(2, X.shape[1], 3), :] + + mask = torch.transpose(mask, 1, 2) + measurement = torch.transpose(measurement, 1, 2) + time_ = torch.transpose(time_, 1, 2) +# measurement_last_obsv = measurement + m_shape = measurement.shape[0] + # we delete last column and prepend mean so that the last observed is used + measurement_last_obsv = measurement[:, 0:measurement.shape[1]-1, :] + measurement_last_obsv = torch.cat((torch.stack([X_mean[:, 0, :]]*m_shape), + measurement_last_obsv), dim = 1) + + convert_to_tensor = lambda x: torch.autograd.Variable(x) + X, X_last_obsv, Mask, Delta, labels = map(convert_to_tensor, + [measurement, + measurement_last_obsv, + mask, + time_, + labels]) + + priv_model.zero_grad() + + prediction = priv_model(X, X_last_obsv, Mask, Delta) + + loss_train = loss_fn(torch.squeeze(prediction), torch.squeeze(labels)) +# with torch.no_grad(): +# losses_train.append(loss_train.item()) +# losses_epoch_train.append(loss_train.item()) + + priv_opt.zero_grad() + loss_train.backward() + priv_opt.step() + scheduler.step(loss_train) + +# avg_losses_epoch_train = sum(losses_epoch_train) / float(len(losses_epoch_train)) +# losses_epochs_train.append(avg_losses_epoch_train) + + return priv_model, niter_per_epoch, privacy_engine#, [losses_train, losses_epochs_train] + +def get_results_df(RESULTS_FOLDER, h_pass, run, task, verbose = False): + task_d = {} + i = 0 + folder = Path(RESULTS_FOLDER, f"{h_pass}{run}") + for filename in folder.glob('*'): + if os.path.isdir(filename): + for subfilename in filename.glob('*'): + if task in str(filename) and 'json' in str(subfilename) and 'results' in str(subfilename): + task_d[i] = unjsonify(subfilename) + i += 1 + if task in str(filename) and 'json' in str(filename): + task_d[i] = unjsonify(filename) + i += 1 + if verbose: print(f'---Processing {h_pass}{run} run for {task} ICU hyperparameter results -----') + task_df = pd.concat([pd.json_normalize(task_d[j]) for j in range(0,i)]) + return task_df \ No newline at end of file From 9973880887cc04648863b7f58fd4c8b27ba6ee8b Mon Sep 17 00:00:00 2001 From: fazelehh Date: Fri, 24 Jan 2025 14:31:36 +0000 Subject: [PATCH 3/5] renaming somefiles, adding pdsgd model --- examples/expm/audit.yaml | 15 +- examples/expm/dpsgd_handler.py | 52 +- examples/expm/run_dpsgd_main.py | 21 +- examples/expm/run_nonprivate_main.py | 2 +- examples/expm/utils/data_handler.py | 350 ------------- .../{gru_model_handler.py => dpsgd_model.py} | 220 +-------- .../{dpsgd_model_handler.py => expm_model.py} | 0 examples/expm/utils/expm_model_handler.py | 0 examples/expm/utils/non_private_model.py | 460 ++++++++++++++++++ 9 files changed, 538 insertions(+), 582 deletions(-) delete mode 100644 examples/expm/utils/data_handler.py rename examples/expm/utils/{gru_model_handler.py => dpsgd_model.py} (70%) rename examples/expm/utils/{dpsgd_model_handler.py => expm_model.py} (100%) delete mode 100644 examples/expm/utils/expm_model_handler.py create mode 100644 
examples/expm/utils/non_private_model.py diff --git a/examples/expm/audit.yaml b/examples/expm/audit.yaml index 171e2316..216a76da 100644 --- a/examples/expm/audit.yaml +++ b/examples/expm/audit.yaml @@ -29,16 +29,21 @@ audit: # Configurations for auditing output_dir: "./examples/expm/leakpro_output" attack_type: "mia" #mia, gia + dpsgd: + dpsgd_use: True + dpsgd_path: "./examples/expm/target_GRUD/dpsgd_dic.pkl" + + target: # Target model path - module_path: "./examples/expm/utils/dpsgd_model_handler.py" # either model_grud.py or model_LR.py for logestic regression - model_class: "GRUD" # LR/GRUD + module_path: "examples/expm/utils/dpsgd_model.py" # either model_grud.py or model_LR.py for logestic regression + model_class: "GRUD_DPSGD" # LR/GRUD # Data paths - target_folder: "./target_GRUD" # either target_GRUD or target_LR + target_folder: "./target_dpsgd" # either target_GRUD or target_LR data_path: "./examples/expm/data/mimic/dataset.pkl" #unflattened dataset for GRUD and flattened dataset for LR shadow_model: - model_class: GRUD # LR/GRUD + model_class: dpsgd_model_handler # LR/GRUD -distillation_model: \ No newline at end of file +distillation_model: "GRUD_DPSGD" \ No newline at end of file diff --git a/examples/expm/dpsgd_handler.py b/examples/expm/dpsgd_handler.py index 50eeef97..8a43b260 100644 --- a/examples/expm/dpsgd_handler.py +++ b/examples/expm/dpsgd_handler.py @@ -5,6 +5,10 @@ from tqdm import tqdm from sklearn.metrics import accuracy_score from leakpro import AbstractInputHandler +import os +import pickle +from opacus.accountants.utils import get_noise_multiplier +from opacus import PrivacyEngine, GradSampleModule class MimicInputHandlerGRU(AbstractInputHandler): @@ -37,8 +41,52 @@ def train( optimizer: optim.Optimizer = None, epochs: int = None, ) -> dict: - - """Model training procedure.""" + + if self.configs['audit']['dpsgd']: + print("Training shadow models with DP-SGD") + dpsgd_path = self.configs['audit']['dpsgd']['dpsgd_path'] + + sample_rate = 1/len(dataloader) + # Check if the file exists + if os.path.exists(dpsgd_path): + # Open and read the pickle file + with open(dpsgd_path, "rb") as file: + data = pickle.load(file) + print("Pickle file loaded successfully!") + print("Data:", data) + else: + print(f"File not found at: {dpsgd_path}") + try: + noise_multiplier = get_noise_multiplier(target_epsilon = 2, + target_delta = data['delta'], + sample_rate = data['sample_rate'], + epochs = 21, + epsilon_tolerance = data['epsilon_tolerance'], + accountant = 'prv', + eps_error = data['eps_error']) + except: + # the prv accountant is not robust to large epsilon (even epsilon = 10) + # so we will use rdp when it fails, so the actual epsilon may be slightly off + # see https://github.com/pytorch/opacus/issues/604 + noise_multiplier = get_noise_multiplier(target_epsilon = 2, + target_delta = data['delta'], + sample_rate = sample_rate, + epochs = 21, + epsilon_tolerance = 0.01, + accountant = 'rdp') + # make the model private + privacy_engine = PrivacyEngine(accountant = 'prv') + model, optimizer, dataloader = privacy_engine.make_private( + module=model, + optimizer=optimizer, + data_loader=dataloader, + noise_multiplier=noise_multiplier, + max_grad_norm=1, + ) + + + + device_name = device("cuda" if cuda.is_available() else "cpu") model.to(device_name) model.train() diff --git a/examples/expm/run_dpsgd_main.py b/examples/expm/run_dpsgd_main.py index 2b99f0e1..854e9b98 100644 --- a/examples/expm/run_dpsgd_main.py +++ b/examples/expm/run_dpsgd_main.py @@ -4,14 +4,14 @@ from torch 
import zeros from utils.data_handler import get_mimic_dataloaders, get_mimic_dataset from opacus.accountants.utils import get_noise_multiplier -from utils.gru_model_handler import * +from utils.dpsgd_model import * # Generate the dataset and dataloaders path = os.path.join(os.getcwd(), "examples/expm/data/mimic/") epsilons = [.0001, .001, .01, .1, .5, 1, 2, 3.5, 5, 7, 10] # epsilons to run over delta = 1e-5 - +target_epsilon = 2 train_frac = 0.4 valid_frac = 0.0 @@ -36,8 +36,19 @@ sample_rate = 1/len(train_loader) # already incorporates batchsize +noise_multiplier_dict = { + "target_epsilon": target_epsilon, + "target_delta": delta, + "sample_rate": sample_rate, + "epochs": 21, + "epsilon_tolerance": 0.01, + "accountant": 'prv', + "eps_error": 0.01, + "max_grad_norm": 1, +} + try: - noise_multiplier = get_noise_multiplier(target_epsilon = 2, + noise_multiplier = get_noise_multiplier(target_epsilon = target_epsilon, target_delta = delta, sample_rate = sample_rate, epochs = 21, @@ -62,7 +73,7 @@ "cell_size": 58, "hidden_size": 78, "learning_rate": 0.0004738759319792616, - "num_epochs":37, + "num_epochs":1, "patience_early_stopping": 20, "patience_lr_scheduler": 5, "batch_size": 59, @@ -87,7 +98,7 @@ # Initialize the model with filtered parameters -model = GRUD(**model_params) +model = GRUD_DPSGD(**model_params) # Train the model results= dpsgd_gru_trained_model_and_metadata( model, diff --git a/examples/expm/run_nonprivate_main.py b/examples/expm/run_nonprivate_main.py index 59314519..b46cb498 100644 --- a/examples/expm/run_nonprivate_main.py +++ b/examples/expm/run_nonprivate_main.py @@ -2,7 +2,7 @@ import sys from torch import zeros -from utils.data_handler import get_mimic_dataloaders, get_mimic_dataset +from examples.expm.utils.data_preparation import get_mimic_dataloaders, get_mimic_dataset from opacus.accountants.utils import get_noise_multiplier from utils.gru_model_handler import * diff --git a/examples/expm/utils/data_handler.py b/examples/expm/utils/data_handler.py deleted file mode 100644 index b59339f5..00000000 --- a/examples/expm/utils/data_handler.py +++ /dev/null @@ -1,350 +0,0 @@ -""" -This file is inspired by https://github.com/MLforHealth/MIMIC_Extract -MIT License -Copyright (c) 2019 MIT Laboratory for Computational Physiology -""" -#TODO: Do I need to include the license for this file.? 
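
# Illustrative sketch: the get_noise_multiplier helper above calibrates sigma by
# binary search over an accountant; the inverse check, i.e. the epsilon reported
# for a fixed sigma, uses the same accountant API the helper relies on. All
# numeric values below are placeholders, not the experiment's settings.
from opacus.accountants import create_accountant

def check_epsilon(noise_multiplier: float, sample_rate: float, epochs: int,
                  delta: float, mechanism: str = "rdp") -> float:
    """Epsilon an accountant reports for a fixed (sigma, sample_rate, steps)."""
    steps = int(epochs / sample_rate)                  # same step-count convention as above
    accountant = create_accountant(mechanism=mechanism)
    accountant.history = [(noise_multiplier, sample_rate, steps)]
    return accountant.get_epsilon(delta=delta)

# e.g. sigma = 1.1 at a 1% sample rate for 21 epochs, delta = 1e-5
print(f"accounted epsilon: {check_epsilon(1.1, 0.01, 21, 1e-5):.3f}")
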
-import os -import pickle - -import numpy as np -import pandas as pd -from sklearn.preprocessing import StandardScaler -from torch import Tensor, from_numpy -from torch.utils.data import DataLoader, Dataset, Subset -from utils.gru_model_handler import to_3D_tensor -from tqdm import tqdm - - -class MimicDataset(Dataset): - def __init__(self, x, y): - # Check if x is already a tensor - if not isinstance(x, Tensor): - self.x = from_numpy(x).float() # Convert features to torch tensors if needed - else: - self.x = x.float() # Ensure it is of type float - - # Check if y is already a tensor - if not isinstance(y, Tensor): - self.y = from_numpy(y).float() # Convert labels to torch tensors if needed - else: - self.y = y.float() # Ensure it is of type float - - def __len__(self): - return len(self.y) - - def __getitem__(self, idx): - return self.x[idx], self.y[idx].squeeze(0) - - def subset(self, indices): - return MimicDataset(self.x[indices], self.y[indices]) - - -def get_mimic_dataset(path, - train_frac, - validation_frac, - test_frac, - early_stop_frac, - use_LR = True): - """Get the dataset, download it if necessary, and store it.""" - - # Assert that the sum of all fractions is between 0 and 1 - total_frac = train_frac + validation_frac + test_frac + early_stop_frac - assert 0 < total_frac <= 1, "The sum of dataset fractions must be between 0 and 1." - - - dataset_path = os.path.join(path, "dataset.pkl") - indices_path = os.path.join(path, "indices.pkl") - - if os.path.exists(dataset_path) and os.path.exists(indices_path): - print("Loading dataset...") - with open(dataset_path, "rb") as f: - dataset = pickle.load(f) # Load the dataset - with open(indices_path, "rb") as f: - indices_dict = pickle.load(f) # Load the dictionary containing indices - train_indices = indices_dict["train_indices"] # Get the actual train indices - validation_indices = indices_dict["validation_indices"] # Get the actual validation indices - test_indices = indices_dict["test_indices"] # Get the actual test indices - early_stop_indices = indices_dict["early_stop_indices"] # Get the actual early stop indices - print(f"Loaded dataset from {dataset_path}") - return dataset, train_indices, validation_indices ,test_indices, early_stop_indices - - data_file_path = os.path.join(path, "all_hourly_data.h5") - if os.path.exists(data_file_path): - print("Loading data...") - data = pd.read_hdf(data_file_path, "vitals_labs") - statics = pd.read_hdf(data_file_path, "patients") - - ID_COLS = ["subject_id", "hadm_id", "icustay_id"] - - print("Splitting data...") - train_data, holdout_data, y_train, y_holdout_data = data_splitter(statics, - data, - train_frac) - - print("Normalizing data...") - train_data , holdout_data = data_normalization(train_data, holdout_data) - - print("Imputing missing values...") - train_data, holdout_data = [ - simple_imputer(df, ID_COLS) for df in tqdm((train_data, holdout_data), desc="Imputation")] - - - # Skip pivot_table if flatten is False - train, holdout, label_train, label_holdout = train_data, holdout_data, y_train, y_holdout_data - - assert_no_missing_values(train_data, holdout_data, train, holdout) - - train_df, holdout_df = standard_scaler(train, holdout) - - # Creating the dataset - data_x = pd.concat((train_df, holdout_df), axis=0) - data_y = pd.concat((label_train, label_holdout), axis=0) - - assert np.issubdtype(data_x.values.dtype, np.number), "Non-numeric data found in features." - assert np.issubdtype(data_y.values.dtype, np.number), "Non-numeric data found in labels." 
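
# Illustrative sketch: the load-or-build caching used by get_mimic_dataset above,
# reduced to its essentials. build_fn stands in for the full MIMIC preprocessing
# pipeline, and the path in the usage comment is a placeholder.
import os
import pickle

def load_or_build(cache_path, build_fn):
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as f:
            return pickle.load(f)
    obj = build_fn()
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(cache_path, "wb") as f:
        pickle.dump(obj, f)
    return obj

# e.g. dataset = load_or_build("./data/mimic/dataset.pkl", build_fn=build_mimic_dataset)
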
- - print("Creating dataset...") - if use_LR: - dataset = MimicDataset(data_x.values, data_y.values) - else: - data_x = to_3D_tensor(data_x) - dataset = MimicDataset(data_x, data_y.values) - - # Generate indices for training, validation, test, and early stopping - train_indices, validation_indices, test_indices, early_stop_indices = data_indices(data_x, - train_frac, - validation_frac, - test_frac, - early_stop_frac) - - os.makedirs(os.path.dirname(dataset_path), exist_ok=True) - # Save the dataset to dataset.pkl - print("Saving dataset and indices...") - with open(dataset_path, "wb") as file: - pickle.dump(dataset, file) - print(f"Saved dataset to {dataset_path}") - - # Save train and test indices to indices.pkl - indices_to_save = { - "train_indices": train_indices, - "validation_indices": validation_indices, - "test_indices": test_indices, - "early_stop_indices": early_stop_indices, - } - with open(indices_path, "wb") as file: - pickle.dump(indices_to_save, file) - print(f"Saved train and test indices to {indices_path}") - else: - msg = "Please download the MIMIC-III dataset from https://physionet.org/content/mimiciii/1.4/ and save it in the specified path." - raise FileNotFoundError(msg) - return dataset, train_indices, validation_indices, test_indices, early_stop_indices - - -def data_splitter(statics, - data, - train_frac): - GAP_TIME = 6 # In hours - WINDOW_SIZE = 24 # In hours - SEED = 1 - - Ys = statics[statics.max_hours > WINDOW_SIZE + GAP_TIME][["los_icu"]] - Ys["los_3"] = Ys["los_icu"] > 3 - Ys.drop(columns=["los_icu"], inplace=True) - Ys["los_3"] = Ys["los_3"].astype(float) - - lvl2 = data[ - (data.index.get_level_values("icustay_id").isin(set(Ys.index.get_level_values("icustay_id")))) & - (data.index.get_level_values("hours_in") < WINDOW_SIZE) - ] - - data_subj_idx, y_subj_idx = [df.index.get_level_values("subject_id") for df in (lvl2, Ys)] - data_subjects = set(data_subj_idx) - assert data_subjects == set(y_subj_idx), "Subject ID pools differ!" 
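
# Illustrative sketch: the subject-level split that data_splitter performs, shown
# on a toy frame. Splitting on subject_id keeps every row of a patient on one side
# of the train/holdout boundary; the sizes and seed here are placeholders.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
toy = pd.DataFrame({
    "subject_id": np.repeat(np.arange(10), 4),   # 10 patients, 4 hourly rows each
    "value": rng.normal(size=40),
})

subjects = rng.permutation(toy["subject_id"].unique())
n_train = int(0.4 * len(subjects))               # train_frac = 0.4, as in the example
train_subj, holdout_subj = set(subjects[:n_train]), set(subjects[n_train:])

train_df = toy[toy["subject_id"].isin(train_subj)]
holdout_df = toy[toy["subject_id"].isin(holdout_subj)]
assert train_subj.isdisjoint(holdout_subj)
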
- - # Randomly shuffle subjects and compute the sizes of the splits - np.random.seed(SEED) - subjects = np.random.permutation(list(data_subjects)) - N = len(subjects) - N_train = int(train_frac * N) - - # Ensure no overlap between train and test sets - train_subj = subjects[:N_train] - test_subj = subjects[N_train::] - - # Split the data according to the subjects - (train_data, holdout_data), (y_train, y_holdout) = [ - [df[df.index.get_level_values("subject_id").isin(s)] for s in (train_subj, test_subj)] - for df in (lvl2, Ys) - ] - return train_data, holdout_data, y_train, y_holdout - -# def simple_imputer(dataframe, -# ID_COLS): -# idx = pd.IndexSlice -# df = dataframe.copy() -# if len(df.columns.names) > 2: df.columns = df.columns.droplevel(("label", "LEVEL1", "LEVEL2")) - -# df_out = df.loc[:, idx[:, ["mean", "count"]]] -# icustay_means = df_out.loc[:, idx[:, "mean"]].groupby(ID_COLS).mean() - -# df_out.loc[:, idx[:, "mean"]] = ( -# df_out.loc[:, idx[:, "mean"]] -# .groupby(ID_COLS) -# .ffill() # Replace forward fill method -# .groupby(ID_COLS) -# .fillna(icustay_means) # Fill remaining NaNs with icustay_means -# .fillna(0) # Fill any remaining NaNs with 0 -# ) - -# # df_out.loc[:,idx[:,"mean"]] = df_out.loc[:,idx[:,"mean"]].groupby(ID_COLS).fillna( -# # method="ffill" -# # ).groupby(ID_COLS).fillna(icustay_means).fillna(0) - -# df_out.loc[:, idx[:, "count"]] = (df.loc[:, idx[:, "count"]] > 0).astype(float) -# df_out.rename(columns={"count": "mask"}, level="Aggregation Function", inplace=True) - -# is_absent = (1 - df_out.loc[:, idx[:, "mask"]]) -# hours_of_absence = is_absent.cumsum() -# time_since_measured = hours_of_absence - hours_of_absence[is_absent==0].fillna(method="ffill") -# time_since_measured.rename(columns={"mask": "time_since_measured"}, level="Aggregation Function", inplace=True) - -# df_out = pd.concat((df_out, time_since_measured), axis=1) -# df_out.loc[:, idx[:, "time_since_measured"]] = df_out.loc[:, idx[:, "time_since_measured"]].fillna(100) - -# df_out.sort_index(axis=1, inplace=True) -# return df_out - -def simple_imputer(dataframe, ID_COLS): - idx = pd.IndexSlice - df = dataframe.copy() - - # Adjust column levels if necessary - if len(df.columns.names) > 2: - df.columns = df.columns.droplevel(("label", "LEVEL1", "LEVEL2")) - - # Select mean and count columns - df_out = df.loc[:, idx[:, ["mean", "count"]]].copy() # Explicit deep copy - - # Compute group-level means - icustay_means = df_out.loc[:, idx[:, "mean"]].groupby(ID_COLS).transform("mean") - - # Forward fill and fill NaNs with icustay_means - df_out.loc[:, idx[:, "mean"]] = ( - df_out.loc[:, idx[:, "mean"]] - .groupby(ID_COLS) - .ffill() # Forward fill within groups - ) - df_out.loc[:, idx[:, "mean"]] = df_out.loc[:, idx[:, "mean"]].fillna(icustay_means) - - # Fill remaining NaNs with 0 - df_out.loc[:, idx[:, "mean"]] = df_out.loc[:, idx[:, "mean"]].fillna(0) - - # Binary mask for count columns - df_out.loc[:, idx[:, "count"]] = (df.loc[:, idx[:, "count"]] > 0).astype(float) - df_out = df_out.rename(columns={"count": "mask"}, level="Aggregation Function") # Avoid inplace=True - - # Calculate time since last measurement - is_absent = (1 - df_out.loc[:, idx[:, "mask"]]) - hours_of_absence = is_absent.cumsum() - time_since_measured = hours_of_absence - hours_of_absence[is_absent == 0].ffill() - time_since_measured.rename(columns={"mask": "time_since_measured"}, level="Aggregation Function", inplace=True) - - # Add time_since_measured to the output - df_out = pd.concat((df_out, time_since_measured), axis=1) - 
df_out.loc[:, idx[:, "time_since_measured"]] = df_out.loc[:, idx[:, "time_since_measured"]].fillna(100) - - # Sort columns by index - df_out.sort_index(axis=1, inplace=True) - - return df_out - - - - - -def data_indices(dataset, - train_frac, - valid_frac, - test_frac, - early_stop_frac): - N = len(dataset) - N_train = int(train_frac * N) - N_validation = int(valid_frac * N) - N_test = int(test_frac * N) - N_early_stop = int(early_stop_frac * N) - - # Generate sequential indices for training and testing - # Indices from 0 to N_train-1 - train_indices = list(range(N_train)) - # Indices from N_train to N_train + N_validation-1 - validation_indices = list(range(N_train, N_train + N_validation)) - # Indices for test set - test_indices = list(range(N_train + N_validation, N_train + N_validation + N_test)) - # Indices for early stopping - early_stop_indices = list(range(N_train + N_validation + N_test, N_train + N_validation + N_test + N_early_stop)) - return train_indices, validation_indices, test_indices, early_stop_indices - - -def get_mimic_dataloaders(dataset, - train_indices, - validation_indices, - test_indices, - early_stop_indices, - batch_size=128): - train_subset = Subset(dataset, train_indices) - test_subset = Subset(dataset, test_indices) - validation_subset = Subset(dataset, validation_indices) - early_stop_subset = Subset(dataset, early_stop_indices) - - train_loader = DataLoader(train_subset, batch_size, shuffle=False) - test_loader = DataLoader(test_subset, batch_size, shuffle=False) - validation_loader = DataLoader(validation_subset, batch_size, shuffle=False) - early_stop_loader = DataLoader(early_stop_subset, batch_size, shuffle=False) - - return train_loader, validation_loader, test_loader, early_stop_loader - - -def data_normalization(lvl2_train, - lvl2_test): - idx = pd.IndexSlice - lvl2_means, lvl2_stds = lvl2_train.loc[:, idx[:,"mean"]].mean(axis=0), lvl2_train.loc[:, idx[:,"mean"]].std(axis=0) - - lvl2_train.loc[:, idx[:,"mean"]] = (lvl2_train.loc[:, idx[:,"mean"]] - lvl2_means)/lvl2_stds - lvl2_test.loc[:, idx[:,"mean"]] = (lvl2_test.loc[:, idx[:,"mean"]] - lvl2_means)/lvl2_stds - return lvl2_train, lvl2_test - - -def standard_scaler(flat_train, - flat_test): - # Initialize the scaler - scaler = StandardScaler() - - # Identify continuous columns (float64 and int64 types) - continuous_columns = flat_train.select_dtypes(include=["float64", "int64"]).columns - - # Fit the scaler on training data and transform both training and test sets - train_flat_continuous = scaler.fit_transform(flat_train[continuous_columns]) - test_flat_continuous = scaler.transform(flat_test[continuous_columns]) - - # Create copies of the original DataFrames - train_scaled = flat_train.copy() - test_scaled = flat_test.copy() - - # Replace continuous columns with the scaled versions - train_scaled[continuous_columns] = train_flat_continuous - test_scaled[continuous_columns] = test_flat_continuous - - # Return the scaled DataFrames - return train_scaled, test_scaled - - -def flatten_multi_index(df): - """Flattens the multi-index DataFrame by resetting the index.""" - return df.reset_index(drop=True) - - -def assert_no_missing_values(*dfs): - """Asserts that no DataFrame in the input list contains any missing values.""" - for df in dfs: - assert not df.isnull().any().any(), "DataFrame contains missing values!" 
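
# Illustrative sketch: how simple_imputer above derives its time_since_measured
# channel, reduced to a single feature. The DataFrame code masks with a boolean
# frame; .where() reproduces that masking for a plain Series. The toy mask values
# are placeholders.
import pandas as pd

mask = pd.Series([1, 0, 0, 1, 0, 1], dtype=float)        # 1 = feature observed that hour
is_absent = 1 - mask
hours_of_absence = is_absent.cumsum()                    # running count of missing hours
last_observed = hours_of_absence.where(is_absent == 0).ffill()
time_since_measured = hours_of_absence - last_observed   # hours since last observation
print(time_since_measured.fillna(100).tolist())          # [0.0, 1.0, 2.0, 0.0, 1.0, 0.0]
# .fillna(100) mirrors the sentinel used above for hours before the first observation
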
\ No newline at end of file diff --git a/examples/expm/utils/gru_model_handler.py b/examples/expm/utils/dpsgd_model.py similarity index 70% rename from examples/expm/utils/gru_model_handler.py rename to examples/expm/utils/dpsgd_model.py index 907e7833..d15f91b9 100644 --- a/examples/expm/utils/gru_model_handler.py +++ b/examples/expm/utils/dpsgd_model.py @@ -76,7 +76,7 @@ def __repr__(self): + ", out_features=" + str(self.out_features) \ + ", bias=" + str(self.bias is not None) + ")" -class GRUD(nn.Module): +class GRUD_DPSGD(nn.Module): def __init__(self, input_size, cell_size, hidden_size, X_mean, batch_size = 0, droupout=0, bn_flag = True, output_last = False): """With minor modifications from https://github.com/zhiyongc/GRU-D/ @@ -255,7 +255,6 @@ def to_numpy(tensor): - def dpsgd_gru_trained_model_and_metadata(model, train_dataloader, test_dataloader, @@ -479,220 +478,3 @@ def dpsgd_gru_trained_model_and_metadata(model, with open("target_GRUD/priv_model_metadata.pkl", "wb") as f: pickle.dump(meta_data, f) return [train_losses, test_losses, train_acces, test_acces, priv_model, niter_per_epoch, privacy_engine] - - - - - - - - - - - -def gru_trained_model_and_metadata(model, - train_dataloader, - test_dataloader, - epochs, - patience_early_stopping, - patience_lr, - min_delta, - learning_rate): - - print("Model Structure: ", model) - print("Start Training ... ") - - # Check if the input tensor is 3D - # This check is nessary because the GRU-D model expects a 3D tensor, meaning the input data should not be flattened - # The input tensor should have the shape (num_datapoints, num_features, num_timepoints) - if train_dataloader.dataset.dataset.x.ndimension() != 3: - warnings.warn("Input tensor is not 3D. There might be a mismatch between .", UserWarning) - - # Early Stopping - min_loss_epoch_valid = float("inf") # Initialize to infinity for comparison - patient_epoch = 0 # Initialize patient counter - - device_name = device("cuda" if cuda.is_available() else "cpu") - - if isinstance(model, nn.Sequential): - output_last = model[-1].output_last - print("Output type dermined by the last layer") - else: - output_last = model.output_last - print("Output type dermined by the model") - - ##TODO BCE - criterion_CEL = nn.CrossEntropyLoss() - criterion_MSE = nn.MSELoss() - criterion_BCE = nn.BCEWithLogitsLoss() - optimizer = optim.Adam(model.parameters(), lr=learning_rate) - - # Reduce learning rate when a metric has stopped improving - scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience = patience_lr) - - - train_losses = [] - test_losses = [] - test_acces = [] - train_acces = [] - - - cur_time = time.time() - pre_time = time.time() - - - model.to(device_name) - - for epoch in tqdm(range(epochs), desc="Training Progress"): - - model.train() - train_loss = 0.0 - - test_dataloader_iter = iter(test_dataloader) - - for _, (X, labels) in enumerate(tqdm(train_dataloader, desc="Training Batches")): - - X = X.to(device_name) - labels = labels.to(device_name) - labels = labels.long() - prediction = model(X) - - output_last = True - if output_last: - loss = criterion_CEL(squeeze(prediction), squeeze(labels)) - else: - full_labels = cat((X[:,1:,:], labels), dim = 1) - loss = criterion_MSE(prediction, full_labels) - - - optimizer.zero_grad() - loss.backward() - optimizer.step() - train_loss += loss.item() - - train_loss /= len(train_dataloader) - train_losses.append(train_loss) - - # Convert predictions to class indices - binary_predictions = to_numpy(prediction).argmax(axis=1) - - # Ensure 
labels are integer and 1D - binary_labels = to_numpy(labels).astype(int) - # Compute accuracy - train_acc = accuracy_score(binary_labels, binary_predictions) - train_acces.append(train_acc) - - # test - model.eval() - try: - X_test, labels_test = next(test_dataloader_iter) - except StopIteration: - valid_dataloader_iter = iter(test_dataloader) - X_test, labels_test = next(valid_dataloader_iter) - - - model.zero_grad() - X_test = X_test.to(device_name) - labels_test = labels_test.to(device_name) - labels_test = labels_test.long() - - prediction_test = model(X_test) - - - - if output_last: - test_loss = criterion_CEL(squeeze(prediction_test), squeeze(labels_test)) - else: - full_labels_val = cat((X_test[:,1:,:], labels_test), dim = 1) - test_loss = criterion_MSE(prediction_test, full_labels_val) - - test_loss = test_loss.cpu().item() - test_losses.append(test_loss) - - # Convert predictions to class indices - binary_predictions_test = to_numpy(prediction_test).argmax(axis=1) - - # Ensure labels are integer and 1D - binary_labels_test = to_numpy(labels_test).astype(int) - # Compute accuracy - test_acc = accuracy_score(binary_labels_test, binary_predictions_test) - test_acces.append(test_acc) - - # Early stopping - # Assume test_loss is computed for validation set - if test_loss < min_loss_epoch_valid - min_delta: # Improvement condition - min_loss_epoch_valid = test_loss - patient_epoch = 0 - print(f"Epoch {epoch}: Validation loss improved to {test_loss:.4f}") - else: - patient_epoch += 1 - print(f"Epoch {epoch}: No improvement. Patience counter: {patient_epoch}/{patience_early_stopping}") - - if patient_epoch >= patience_early_stopping: - print(f"Early stopping at epoch {epoch}. Best validation loss: {min_loss_epoch_valid:.4f}") - break - - # Step the scheduler - scheduler.step(test_loss) - - # Check the learning rate - current_lr = optimizer.param_groups[0]["lr"] - print(f"Learning Rate: {current_lr:.6f}") - - # Stop if learning rate becomes too small - if current_lr < 1e-6: - print("Learning rate too small, stopping training.") - break - - - # Print training parameters - cur_time = time.time() - print("Epoch: {}, train_loss: {}, valid_loss: {}, time: {}".format( \ - epoch, \ - np.around(train_loss, decimals=8),\ - np.around(test_loss, decimals=8),\ - np.around(cur_time - pre_time, decimals=2))) - pre_time = cur_time - # Move the model back to the CPU - # Ensure the target directory exists - os.makedirs("target_GRUD", exist_ok=True) - model.to("cpu") - with open("target_GRUD/target_model.pkl", "wb") as f: - save(model.state_dict(), f) - - # Create metadata and store it - meta_data = {} - meta_data["train_indices"] = train_dataloader.dataset.indices - meta_data["test_indices"] = test_dataloader.dataset.indices - meta_data["num_train"] = len(meta_data["train_indices"]) - - # Write init params - meta_data["init_params"] = {} - for key, value in model.init_params.items(): - meta_data["init_params"][key] = value - - # read out optimizer parameters - meta_data["optimizer"] = {} - meta_data["optimizer"]["name"] = optimizer.__class__.__name__.lower() - meta_data["optimizer"]["lr"] = optimizer.param_groups[0].get("lr", 0) - meta_data["optimizer"]["weight_decay"] = optimizer.param_groups[0].get("weight_decay", 0) - meta_data["optimizer"]["momentum"] = optimizer.param_groups[0].get("momentum", 0) - meta_data["optimizer"]["dampening"] = optimizer.param_groups[0].get("dampening", 0) - meta_data["optimizer"]["nesterov"] = optimizer.param_groups[0].get("nesterov", False) - - # read out criterion 
parameters - meta_data["loss"] = {} - meta_data["loss"]["name"] = criterion_CEL.__class__.__name__.lower() - - meta_data["batch_size"] = train_dataloader.batch_size - meta_data["epochs"] = epochs - meta_data["train_acc"] = train_acc - meta_data["test_acc"] = test_acc - meta_data["train_loss"] = train_loss - meta_data["test_loss"] = test_loss - meta_data["dataset"] = "mimiciii" - with open("target_GRUD/model_metadata.pkl", "wb") as f: - pickle.dump(meta_data, f) - return train_losses, test_losses, train_acces, test_acces - - diff --git a/examples/expm/utils/dpsgd_model_handler.py b/examples/expm/utils/expm_model.py similarity index 100% rename from examples/expm/utils/dpsgd_model_handler.py rename to examples/expm/utils/expm_model.py diff --git a/examples/expm/utils/expm_model_handler.py b/examples/expm/utils/expm_model_handler.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/expm/utils/non_private_model.py b/examples/expm/utils/non_private_model.py new file mode 100644 index 00000000..15b9b780 --- /dev/null +++ b/examples/expm/utils/non_private_model.py @@ -0,0 +1,460 @@ +""" +This file is inspired by https://github.com/MLforHealth/MIMIC_Extract +MIT License +Copyright (c) 2019 MIT Laboratory for Computational Physiology +""" +import math +import os +import pickle +import time +import warnings + +import numpy as np +import pandas as pd +import torch.nn.functional as F +import torch.utils.data as utils +from sklearn.metrics import accuracy_score +from torch import Tensor, cat, cuda, device, exp, eye, from_numpy, isnan, max, nn, optim, save, sigmoid, squeeze, tanh, zeros, no_grad +from torch.autograd import Variable +from torch.nn.parameter import Parameter +from torch.optim.lr_scheduler import ReduceLROnPlateau +from tqdm import tqdm + +def to_3D_tensor(df): + idx = pd.IndexSlice + np_3D = np.dstack([df.loc[idx[:, :, :, i], :].values for i in sorted(set(df.index.get_level_values("hours_in")))]) + return from_numpy(np_3D) + +def prepare_dataloader(df, Ys, batch_size, shuffle=True): + """Dfs = (df_train, df_dev, df_test). + df_* = (subject, hadm, icustay, hours_in) X (level2, agg fn \ni {mask, mean, time}) + Ys_series = (subject, hadm, icustay) => label. + """ + X = from_numpy(to_3D_tensor(df).astype(np.float32)) + label = from_numpy(Ys.values.astype(np.int64)) + dataset = utils.TensorDataset(X, label) + return utils.DataLoader(dataset, batch_size =int(batch_size) , shuffle=shuffle, drop_last = True) + +class FilterLinear(nn.Module): + def __init__(self, in_features, out_features, filter_square_matrix, device, bias=True): + """filter_square_matrix : filter square matrix, whose each elements is 0 or 1. + """ + super(FilterLinear, self).__init__() + self.in_features = in_features + self.out_features = out_features + + assert in_features > 1 and out_features > 1, "Passing in nonsense sizes" + + self.filter_square_matrix = None + self.filter_square_matrix = Variable(filter_square_matrix.to(device), requires_grad=False) + + self.weight = Parameter(Tensor(out_features, in_features)).to(device) + + if bias: + self.bias = Parameter(Tensor(out_features)).to(device) + else: + self.register_parameter("bias", None) + self.reset_parameters() + + def reset_parameters(self): + stdv = 1. 
/ math.sqrt(self.weight.size(1)) + self.weight.data.uniform_(-stdv, stdv) + if self.bias is not None: + self.bias.data.uniform_(-stdv, stdv) + + def forward(self, x): + return F.linear( + x, + self.filter_square_matrix.mul(self.weight), + self.bias + ) + + def __repr__(self): + return self.__class__.__name__ + "(" \ + + "in_features=" + str(self.in_features) \ + + ", out_features=" + str(self.out_features) \ + + ", bias=" + str(self.bias is not None) + ")" + +class GRUD(nn.Module): + def __init__(self, input_size, cell_size, hidden_size, X_mean, batch_size = 0, + droupout=0, bn_flag = True, output_last = False): + """With minor modifications from https://github.com/zhiyongc/GRU-D/ + + Recurrent Neural Networks for Multivariate Times Series with Missing Values + GRU-D: GRU exploit two representations of informative missingness patterns, i.e., masking and time interval. + cell_size is the size of cell_state. + + Implemented based on the paper: + @article{che2018recurrent, + title={Recurrent neural networks for multivariate time series with missing values}, + author={Che, Zhengping and Purushotham, Sanjay and Cho, Kyunghyun and Sontag, David and Liu, Yan}, + journal={Scientific reports}, + volume={8}, + number={1}, + pages={6085}, + year={2018}, + publisher={Nature Publishing Group} + } + + GRU-D: + input_size: variable dimension of each time + hidden_size: dimension of hidden_state + mask_size: dimension of masking vector + X_mean: the mean of the historical input data + """ + + super(GRUD, self).__init__() + + # Save init params to a dictionary + self.init_params = { + "input_size": input_size, + "cell_size": cell_size, + "hidden_size": hidden_size, + "X_mean": X_mean, + "batch_size": batch_size, + "output_last": output_last, + "dropout":droupout, + "bn_flag":bn_flag, + } + + self.hidden_size = hidden_size + self.delta_size = input_size + self.mask_size = input_size + self.bn_flag = bn_flag + + self.device = device("cuda" if cuda.is_available() else "cpu") + self.identity = eye(input_size).to(self.device) + self.X_mean = Variable(Tensor(X_mean).to(self.device)) + + # Wz, Uz are part of the same network. the bias is bz + self.zl = nn.Linear(input_size + hidden_size + self.mask_size, hidden_size).to(self.device) + + # Wr, Ur are part of the same network. the bias is br + self.rl = nn.Linear(input_size + hidden_size + self.mask_size, hidden_size).to(self.device) + + # W, U are part of the same network. 
the bias is b + self.hl = nn.Linear(input_size + hidden_size + self.mask_size, hidden_size).to(self.device) + + self.gamma_x_l = FilterLinear(self.delta_size, self.delta_size, self.identity, self.device) + self.gamma_h_l = nn.Linear(self.delta_size, self.hidden_size).to(self.device) + self.output_last = output_last + + #TODO: this part differs from gitcode + # self.fc = nn.Linear(self.hidden_size, 2).to(self.device) + # self.bn= nn.BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True).to(self.device) + # self.drop=nn.Dropout(p=0.7, inplace=False) + self.fc = nn.Linear(self.hidden_size, 1) # a probability score + self.drop=nn.Dropout(p=0.48, inplace=False) + if self.bn_flag: + self.bn= nn.BatchNorm1d(self.hidden_size, eps=1e-05, momentum=0.1, affine=True) + + def step(self, x, x_last_obsv, x_mean, h, mask, delta): + """Inputs: + x: input tensor + x_last_obsv: input tensor with forward fill applied + x_mean: the mean of each feature + h: the hidden state of the network + mask: the mask of whether or not the current value is observed + delta: the tensor indicating the number of steps since the last time a feature was observed. + + Returns: + h: the updated hidden state of the network + + """ + + # Assert to check for NaNs in x_mean + assert not isnan(x_mean).any(), "NaN values found in x_mean" + + batch_size = x.size()[0] + feature_size = x.size()[1] + zero_x = zeros(batch_size, feature_size).to(self.device) + zero_h = zeros(batch_size, self.hidden_size).to(self.device) + + + gamma_x_l_delta = self.gamma_x_l(delta) + delta_x = exp(-max(zero_x, gamma_x_l_delta)) + + gamma_h_l_delta = self.gamma_h_l(delta) + delta_h = exp(-max(zero_h, gamma_h_l_delta)) + + x_mean = x_mean.repeat(batch_size, 1) + + x = mask * x + (1 - mask) * (delta_x * x_last_obsv + (1 - delta_x) * x_mean) + h = delta_h * h + + combined = cat((x, h, mask), 1) + # Assert to check for NaNs in combined + assert not isnan(combined).any(), "NaN values found in combined" + + z = sigmoid(self.zl(combined)) #sigmoid(W_z*x_t + U_z*h_{t-1} + V_z*m_t + bz) + r = sigmoid(self.rl(combined)) #sigmoid(W_r*x_t + U_r*h_{t-1} + V_r*m_t + br) + combined_new = cat((x, r*h, mask), 1) + h_tilde = tanh(self.hl(combined_new)) #tanh(W*x_t +U(r_t*h_{t-1}) + V*m_t) + b + h = (1 - z) * h + z * h_tilde + + return h + + + def forward(self, X): + """X: Input tensor of shape (batch_size, time_steps * 3, features) + The tensor includes Mask, Measurement, and Delta sequentially for each time step. 
+ """ + + # Step 1: Split the input tensor into Mask, Measurement, and Delta + batch_size = X.size(0) + time_steps = X.size(1) // 3 # Since every 3 consecutive steps represent Mask, Measurement, and Delta + + # Reshape X into 3 separate tensors for Mask, Measurement, and Delta + Mask = X[:, np.arange(0, X.size(1), 3), :] # Extract Mask + Measurement = X[:, np.arange(1, X.size(1), 3), :] # Extract Measurement + Delta = X[:, np.arange(2, X.size(1), 3), :] # Extract Delta + + # Transpose tensors to match (batch_size, time_steps, features) + Mask = Mask.transpose(1, 2) + Measurement = Measurement.transpose(1, 2) + Delta = Delta.transpose(1, 2) + + # X_last_obsv is initialized to Measurement at the starting point + X_last_obsv = Measurement + + # Step 2: Initialize hidden state + step_size = Measurement.size(1) # Number of time points + Hidden_State = self.initHidden(batch_size) + + # Step 3: Iterate through time steps and update the GRU hidden state + outputs = None + for i in range(step_size): + Hidden_State = self.step( + squeeze(Measurement[:, i, :], 1), + squeeze(X_last_obsv[:, i, :], 1), + squeeze(self.X_mean[:, i, :], 1), + Hidden_State, + squeeze(Mask[:, i, :], 1), + squeeze(Delta[:, i, :], 1), + ) + # Collect hidden states + if outputs is None: + outputs = Hidden_State.unsqueeze(1) + else: + outputs = cat((Hidden_State.unsqueeze(1), outputs), 1) + + # Step 4: Predict a binary outcome using FC, BatchNorm, and Dropout layers + if self.bn_flag: + return self.fc(self.bn(self.drop(Hidden_State))) + else: + return self.fc(self.drop(Hidden_State)) + + + def initHidden(self, batch_size): + Hidden_State = Variable(zeros(batch_size, self.hidden_size)).to(self.device) + return Hidden_State + +def to_numpy(tensor): + return tensor.detach().cpu().numpy() if tensor.is_cuda else tensor.detach().numpy() + + +def gru_trained_model_and_metadata(model, + train_dataloader, + test_dataloader, + epochs, + patience_early_stopping, + patience_lr, + min_delta, + learning_rate): + + print("Model Structure: ", model) + print("Start Training ... ") + + # Check if the input tensor is 3D + # This check is nessary because the GRU-D model expects a 3D tensor, meaning the input data should not be flattened + # The input tensor should have the shape (num_datapoints, num_features, num_timepoints) + if train_dataloader.dataset.dataset.x.ndimension() != 3: + warnings.warn("Input tensor is not 3D. 
There might be a mismatch between .", UserWarning) + + # Early Stopping + min_loss_epoch_valid = float("inf") # Initialize to infinity for comparison + patient_epoch = 0 # Initialize patient counter + + device_name = device("cuda" if cuda.is_available() else "cpu") + + if isinstance(model, nn.Sequential): + output_last = model[-1].output_last + print("Output type dermined by the last layer") + else: + output_last = model.output_last + print("Output type dermined by the model") + + ##TODO BCE + criterion_CEL = nn.CrossEntropyLoss() + criterion_MSE = nn.MSELoss() + criterion_BCE = nn.BCEWithLogitsLoss() + optimizer = optim.Adam(model.parameters(), lr=learning_rate) + + # Reduce learning rate when a metric has stopped improving + scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience = patience_lr) + + + train_losses = [] + test_losses = [] + test_acces = [] + train_acces = [] + + cur_time = time.time() + pre_time = time.time() + + + model.to(device_name) + + for epoch in tqdm(range(epochs), desc="Training Progress"): + + model.train() + train_loss = 0.0 + + test_dataloader_iter = iter(test_dataloader) + + for _, (X, labels) in enumerate(tqdm(train_dataloader, desc="Training Batches")): + + X = X.to(device_name) + labels = labels.to(device_name) + labels = labels.long() + prediction = model(X) + + output_last = True + if output_last: + loss = criterion_CEL(squeeze(prediction), squeeze(labels)) + else: + full_labels = cat((X[:,1:,:], labels), dim = 1) + loss = criterion_MSE(prediction, full_labels) + + + optimizer.zero_grad() + loss.backward() + optimizer.step() + train_loss += loss.item() + + train_loss /= len(train_dataloader) + train_losses.append(train_loss) + + # Convert predictions to class indices + binary_predictions = to_numpy(prediction).argmax(axis=1) + + # Ensure labels are integer and 1D + binary_labels = to_numpy(labels).astype(int) + # Compute accuracy + train_acc = accuracy_score(binary_labels, binary_predictions) + train_acces.append(train_acc) + + # test + model.eval() + try: + X_test, labels_test = next(test_dataloader_iter) + except StopIteration: + valid_dataloader_iter = iter(test_dataloader) + X_test, labels_test = next(valid_dataloader_iter) + + + model.zero_grad() + X_test = X_test.to(device_name) + labels_test = labels_test.to(device_name) + labels_test = labels_test.long() + + prediction_test = model(X_test) + + + + if output_last: + test_loss = criterion_CEL(squeeze(prediction_test), squeeze(labels_test)) + else: + full_labels_val = cat((X_test[:,1:,:], labels_test), dim = 1) + test_loss = criterion_MSE(prediction_test, full_labels_val) + + test_loss = test_loss.cpu().item() + test_losses.append(test_loss) + + # Convert predictions to class indices + binary_predictions_test = to_numpy(prediction_test).argmax(axis=1) + + # Ensure labels are integer and 1D + binary_labels_test = to_numpy(labels_test).astype(int) + # Compute accuracy + test_acc = accuracy_score(binary_labels_test, binary_predictions_test) + test_acces.append(test_acc) + + # Early stopping + # Assume test_loss is computed for validation set + if test_loss < min_loss_epoch_valid - min_delta: # Improvement condition + min_loss_epoch_valid = test_loss + patient_epoch = 0 + print(f"Epoch {epoch}: Validation loss improved to {test_loss:.4f}") + else: + patient_epoch += 1 + print(f"Epoch {epoch}: No improvement. Patience counter: {patient_epoch}/{patience_early_stopping}") + + if patient_epoch >= patience_early_stopping: + print(f"Early stopping at epoch {epoch}. 
Best validation loss: {min_loss_epoch_valid:.4f}") + break + + # Step the scheduler + scheduler.step(test_loss) + + # Check the learning rate + current_lr = optimizer.param_groups[0]["lr"] + print(f"Learning Rate: {current_lr:.6f}") + + # Stop if learning rate becomes too small + if current_lr < 1e-6: + print("Learning rate too small, stopping training.") + break + + + # Print training parameters + cur_time = time.time() + print("Epoch: {}, train_loss: {}, valid_loss: {}, time: {}".format( \ + epoch, \ + np.around(train_loss, decimals=8),\ + np.around(test_loss, decimals=8),\ + np.around(cur_time - pre_time, decimals=2))) + pre_time = cur_time + # Move the model back to the CPU + # Ensure the target directory exists + os.makedirs("target_GRUD", exist_ok=True) + model.to("cpu") + with open("target_GRUD/target_model.pkl", "wb") as f: + save(model.state_dict(), f) + + # Create metadata and store it + meta_data = {} + meta_data["train_indices"] = train_dataloader.dataset.indices + meta_data["test_indices"] = test_dataloader.dataset.indices + meta_data["num_train"] = len(meta_data["train_indices"]) + + # Write init params + meta_data["init_params"] = {} + for key, value in model.init_params.items(): + meta_data["init_params"][key] = value + + # read out optimizer parameters + meta_data["optimizer"] = {} + meta_data["optimizer"]["name"] = optimizer.__class__.__name__.lower() + meta_data["optimizer"]["lr"] = optimizer.param_groups[0].get("lr", 0) + meta_data["optimizer"]["weight_decay"] = optimizer.param_groups[0].get("weight_decay", 0) + meta_data["optimizer"]["momentum"] = optimizer.param_groups[0].get("momentum", 0) + meta_data["optimizer"]["dampening"] = optimizer.param_groups[0].get("dampening", 0) + meta_data["optimizer"]["nesterov"] = optimizer.param_groups[0].get("nesterov", False) + + # read out criterion parameters + meta_data["loss"] = {} + meta_data["loss"]["name"] = criterion_CEL.__class__.__name__.lower() + + meta_data["batch_size"] = train_dataloader.batch_size + meta_data["epochs"] = epochs + meta_data["train_acc"] = train_acc + meta_data["test_acc"] = test_acc + meta_data["train_loss"] = train_loss + meta_data["test_loss"] = test_loss + meta_data["dataset"] = "mimiciii" + with open("target_GRUD/model_metadata.pkl", "wb") as f: + pickle.dump(meta_data, f) + return train_losses, test_losses, train_acces, test_acces + + From 238afc146e9b37cb7b00b8521b0e79404ce75b92 Mon Sep 17 00:00:00 2001 From: fazelehh Date: Thu, 30 Jan 2025 08:21:44 +0000 Subject: [PATCH 4/5] dpsgd is running for target and shadow models, the attack was successfully executed --- examples/expm/audit.yaml | 6 +- examples/expm/data/.gitkeep | 0 examples/expm/dpsgd_handler.py | 107 +++++++++--------- examples/expm/run_dpsgd_main.py | 51 ++++----- examples/expm/utils/dpsgd_model.py | 82 ++++++++++---- leakpro/attacks/utils/shadow_model_handler.py | 6 +- 6 files changed, 142 insertions(+), 110 deletions(-) delete mode 100644 examples/expm/data/.gitkeep diff --git a/examples/expm/audit.yaml b/examples/expm/audit.yaml index 216a76da..3bdc0586 100644 --- a/examples/expm/audit.yaml +++ b/examples/expm/audit.yaml @@ -31,7 +31,7 @@ audit: # Configurations for auditing attack_type: "mia" #mia, gia dpsgd: dpsgd_use: True - dpsgd_path: "./examples/expm/target_GRUD/dpsgd_dic.pkl" + dpsgd_path: "./examples/expm/target_dpsgd/dpsgd_dic.pkl" @@ -40,10 +40,10 @@ target: module_path: "examples/expm/utils/dpsgd_model.py" # either model_grud.py or model_LR.py for logestic regression model_class: "GRUD_DPSGD" # LR/GRUD # Data paths - 
target_folder: "./target_dpsgd" # either target_GRUD or target_LR + target_folder: "./examples/expm/target_dpsgd" # either target_GRUD or target_LR data_path: "./examples/expm/data/mimic/dataset.pkl" #unflattened dataset for GRUD and flattened dataset for LR shadow_model: model_class: dpsgd_model_handler # LR/GRUD -distillation_model: "GRUD_DPSGD" \ No newline at end of file +distillation_model: \ No newline at end of file diff --git a/examples/expm/data/.gitkeep b/examples/expm/data/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/expm/dpsgd_handler.py b/examples/expm/dpsgd_handler.py index 8a43b260..5873a6b6 100644 --- a/examples/expm/dpsgd_handler.py +++ b/examples/expm/dpsgd_handler.py @@ -1,6 +1,6 @@ from torch import cuda, device, nn, optim, squeeze -from torch.nn import CrossEntropyLoss +from torch.nn import BCEWithLogitsLoss from torch.utils.data import DataLoader from tqdm import tqdm from sklearn.metrics import accuracy_score @@ -17,9 +17,9 @@ class MimicInputHandlerGRU(AbstractInputHandler): def __init__(self, configs: dict) -> None: super().__init__(configs = configs) - def get_criterion(self)->CrossEntropyLoss: + def get_criterion(self)->BCEWithLogitsLoss: """Set the CrossEntropyLoss for the model.""" - return CrossEntropyLoss() + return BCEWithLogitsLoss() def get_optimizer(self, model:nn.Module) -> optim.Optimizer: """Set the optimizer for the model.""" @@ -42,80 +42,81 @@ def train( epochs: int = None, ) -> dict: - if self.configs['audit']['dpsgd']: - print("Training shadow models with DP-SGD") - dpsgd_path = self.configs['audit']['dpsgd']['dpsgd_path'] - - sample_rate = 1/len(dataloader) - # Check if the file exists - if os.path.exists(dpsgd_path): - # Open and read the pickle file - with open(dpsgd_path, "rb") as file: - data = pickle.load(file) - print("Pickle file loaded successfully!") - print("Data:", data) - else: - print(f"File not found at: {dpsgd_path}") - try: - noise_multiplier = get_noise_multiplier(target_epsilon = 2, - target_delta = data['delta'], - sample_rate = data['sample_rate'], - epochs = 21, - epsilon_tolerance = data['epsilon_tolerance'], - accountant = 'prv', - eps_error = data['eps_error']) - except: - # the prv accountant is not robust to large epsilon (even epsilon = 10) - # so we will use rdp when it fails, so the actual epsilon may be slightly off - # see https://github.com/pytorch/opacus/issues/604 - noise_multiplier = get_noise_multiplier(target_epsilon = 2, - target_delta = data['delta'], - sample_rate = sample_rate, - epochs = 21, - epsilon_tolerance = 0.01, - accountant = 'rdp') - # make the model private - privacy_engine = PrivacyEngine(accountant = 'prv') - model, optimizer, dataloader = privacy_engine.make_private( - module=model, - optimizer=optimizer, - data_loader=dataloader, - noise_multiplier=noise_multiplier, - max_grad_norm=1, - ) - - + print("Training shadow models with DP-SGD") + dpsgd_path = self.configs['audit']['dpsgd']['dpsgd_path'] + + sample_rate = 1/len(dataloader) + # Check if the file exists + if os.path.exists(dpsgd_path): + # Open and read the pickle file + with open(dpsgd_path, "rb") as file: + privacy_engine_dict = pickle.load(file) + print("Pickle file loaded successfully!") + print("Data:", privacy_engine_dict) + else: + raise Exception(f"File not found at: {dpsgd_path}") + + try: + noise_multiplier = get_noise_multiplier(target_epsilon = privacy_engine_dict["target_epsilon"], + target_delta = privacy_engine_dict["target_delta"], + sample_rate = sample_rate , + epochs = 
privacy_engine_dict["epochs"], + epsilon_tolerance = privacy_engine_dict["epsilon_tolerance"], + accountant = 'prv', + eps_error = privacy_engine_dict["eps_error"],) + except: + # the prv accountant is not robust to large epsilon (even epsilon = 10) + # so we will use rdp when it fails, so the actual epsilon may be slightly off + # see https://github.com/pytorch/opacus/issues/604 + noise_multiplier = get_noise_multiplier(target_epsilon = 2, + target_delta = privacy_engine_dict["target_delta"], + sample_rate = sample_rate, + epochs = privacy_engine_dict["epochs"], + epsilon_tolerance = privacy_engine_dict["epsilon_tolerance"], + accountant = 'rdp') + + # make the model private + privacy_engine = PrivacyEngine(accountant = 'prv') + model, optimizer, dataloader = privacy_engine.make_private( + module=model, + optimizer=optimizer, + data_loader=dataloader, + noise_multiplier=noise_multiplier, + max_grad_norm= privacy_engine_dict["max_grad_norm"], + ) device_name = device("cuda" if cuda.is_available() else "cpu") model.to(device_name) model.train() criterion = self.get_criterion() - optimizer = self.get_optimizer(model) for e in tqdm(range(epochs), desc="Training Progress"): model.train() train_acc, train_loss = 0.0, 0.0 for _, (x, labels) in enumerate(tqdm(dataloader, desc="Training Batches")): + if x.numel() == 0: # Skip empty batches + continue + x = self.convert_to_device(x) labels = self.convert_to_device(labels) - labels = labels.long() + labels = labels.float() optimizer.zero_grad() - output = model(x) + output = model(x).squeeze(dim=1) - loss = criterion(squeeze(output), squeeze(labels).long()) + loss = criterion(output, labels) loss.backward() optimizer.step() + train_loss += loss.item() train_loss = train_loss/len(dataloader) - binary_predictions = self.to_numpy(output).argmax(axis=1) + binary_predictions = (output > 0).float().cpu().numpy() - # Ensure labels are integer and 1D - binary_labels = self.to_numpy(labels).astype(int) + binary_labels = labels.cpu().numpy() # Compute accuracy train_acc = accuracy_score(binary_labels, binary_predictions) diff --git a/examples/expm/run_dpsgd_main.py b/examples/expm/run_dpsgd_main.py index 854e9b98..9ea3355c 100644 --- a/examples/expm/run_dpsgd_main.py +++ b/examples/expm/run_dpsgd_main.py @@ -3,21 +3,23 @@ from torch import zeros from utils.data_handler import get_mimic_dataloaders, get_mimic_dataset -from opacus.accountants.utils import get_noise_multiplier -from utils.dpsgd_model import * +from utils.dpsgd_model import * +# Import and initialize ReportHandler +from leakpro.reporting.report_handler import ReportHandler # Generate the dataset and dataloaders path = os.path.join(os.getcwd(), "examples/expm/data/mimic/") +target_model_dir = "./examples/expm/target_dpsgd" epsilons = [.0001, .001, .01, .1, .5, 1, 2, 3.5, 5, 7, 10] # epsilons to run over delta = 1e-5 -target_epsilon = 2 +target_epsilon = 3.5 train_frac = 0.4 valid_frac = 0.0 test_frac = 0.0 early_stop_frac = 0.4 -batch_size = 59 +batch_size = 55 use_LR = False # True if you want to use the LR model, False if you want to use the GRUD model dataset, train_indices, validation_indices, test_indices, early_stop_indices= get_mimic_dataset(path, @@ -47,25 +49,7 @@ "max_grad_norm": 1, } -try: - noise_multiplier = get_noise_multiplier(target_epsilon = target_epsilon, - target_delta = delta, - sample_rate = sample_rate, - epochs = 21, - epsilon_tolerance = 0.01, - accountant = 'prv', - eps_error = 0.01) -except: - # the prv accountant is not robust to large epsilon (even epsilon = 10) - # so 
we will use rdp when it fails, so the actual epsilon may be slightly off - # see https://github.com/pytorch/opacus/issues/604 - noise_multiplier = get_noise_multiplier(target_epsilon = 2, - target_delta = delta, - sample_rate = sample_rate, - epochs = 21, - epsilon_tolerance = 0.01, - accountant = 'rdp') - + @@ -73,13 +57,13 @@ "cell_size": 58, "hidden_size": 78, "learning_rate": 0.0004738759319792616, - "num_epochs":1, + "num_epochs":50, "patience_early_stopping": 20, "patience_lr_scheduler": 5, "batch_size": 59, "seed": 4410, "min_delta": 0.00001, - "epsilon": 10, + "epsilon": 3.5, "max_grad_norm": 1, } n_features = int(dataset.x.shape[1]/3) @@ -93,7 +77,7 @@ "X_mean": X_mean, "output_last": False, "bn_flag": False, - "droupout": 0.1, + # "droupout": 0.33, }) @@ -104,13 +88,13 @@ model, train_loader, early_stop_loader, - noise_multiplier, - max_grad_norm = optimized_hyperparams['max_grad_norm'], + noise_multiplier_dict, epochs=optimized_hyperparams['num_epochs'], patience_early_stopping = optimized_hyperparams["patience_early_stopping"], patience_lr= optimized_hyperparams["patience_lr_scheduler"], min_delta = optimized_hyperparams["min_delta"], - learning_rate = optimized_hyperparams["learning_rate"]) + learning_rate = optimized_hyperparams["learning_rate"], + target_model_dir = target_model_dir,) train_losses, test_losses , train_acc, test_acc, best_model,niter_per_epoch, privacy_engine = results @@ -159,3 +143,12 @@ # Run the audit mia_results = leakpro.run_audit(return_results=True) + + +# report_handler = ReportHandler() +report_handler = ReportHandler(report_dir="./examples/expm/leakpro_output/results") + +# Save MIA resuls using report handler +for res in mia_results: + report_handler.save_results(attack_name=res.attack_name, result_data=res, config=res.configs) + diff --git a/examples/expm/utils/dpsgd_model.py b/examples/expm/utils/dpsgd_model.py index d15f91b9..273da64b 100644 --- a/examples/expm/utils/dpsgd_model.py +++ b/examples/expm/utils/dpsgd_model.py @@ -20,6 +20,7 @@ from torch.optim.lr_scheduler import ReduceLROnPlateau from tqdm import tqdm from opacus import PrivacyEngine, GradSampleModule +from opacus.accountants.utils import get_noise_multiplier def to_3D_tensor(df): idx = pd.IndexSlice @@ -77,8 +78,8 @@ def __repr__(self): + ", bias=" + str(self.bias is not None) + ")" class GRUD_DPSGD(nn.Module): - def __init__(self, input_size, cell_size, hidden_size, X_mean, batch_size = 0, - droupout=0, bn_flag = True, output_last = False): + def __init__(self, input_size, cell_size, hidden_size, X_mean, batch_size, + bn_flag = True, output_last = False): """With minor modifications from https://github.com/zhiyongc/GRU-D/ Recurrent Neural Networks for Multivariate Times Series with Missing Values @@ -104,7 +105,7 @@ def __init__(self, input_size, cell_size, hidden_size, X_mean, batch_size = 0, X_mean: the mean of the historical input data """ - super(GRUD, self).__init__() + super(GRUD_DPSGD, self).__init__() # Save init params to a dictionary self.init_params = { @@ -114,7 +115,6 @@ def __init__(self, input_size, cell_size, hidden_size, X_mean, batch_size = 0, "X_mean": X_mean, "batch_size": batch_size, "output_last": output_last, - "dropout":droupout, "bn_flag":bn_flag, } @@ -145,7 +145,7 @@ def __init__(self, input_size, cell_size, hidden_size, X_mean, batch_size = 0, # self.bn= nn.BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True).to(self.device) # self.drop=nn.Dropout(p=0.7, inplace=False) self.fc = nn.Linear(self.hidden_size, 1) # a probability score - 
self.drop=nn.Dropout(p=0.48, inplace=False) + self.drop=nn.Dropout(p=0.33, inplace=False) if self.bn_flag: self.bn= nn.BatchNorm1d(self.hidden_size, eps=1e-05, momentum=0.1, affine=True) @@ -258,13 +258,13 @@ def to_numpy(tensor): def dpsgd_gru_trained_model_and_metadata(model, train_dataloader, test_dataloader, - noise_multiplier, - max_grad_norm, + privacy_engine_dict, epochs, patience_early_stopping, patience_lr, min_delta, - learning_rate): + learning_rate, + target_model_dir): print("Model Structure: ", model) print("Start Training ... ") @@ -289,11 +289,32 @@ def dpsgd_gru_trained_model_and_metadata(model, print("Output type dermined by the model") criterion_BCE = nn.BCEWithLogitsLoss() - criterion_CEL = nn.CrossEntropyLoss() + # criterion_CEL = nn.CrossEntropyLoss() criterion_MSE = nn.MSELoss() optimizer = optim.Adam(model.parameters(), lr=learning_rate) + sample_rate = 1 / len(train_dataloader) + try: + noise_multiplier = get_noise_multiplier(target_epsilon = privacy_engine_dict["target_epsilon"], + target_delta = privacy_engine_dict["target_delta"], + sample_rate = sample_rate , + epochs = privacy_engine_dict["epochs"], + epsilon_tolerance = privacy_engine_dict["epsilon_tolerance"], + accountant = 'prv', + eps_error = privacy_engine_dict["eps_error"],) + except: + # the prv accountant is not robust to large epsilon (even epsilon = 10) + # so we will use rdp when it fails, so the actual epsilon may be slightly off + # see https://github.com/pytorch/opacus/issues/604 + noise_multiplier = get_noise_multiplier(target_epsilon = 2, + target_delta = privacy_engine_dict["target_delta"], + sample_rate = sample_rate, + epochs = privacy_engine_dict["epochs"], + epsilon_tolerance = privacy_engine_dict["epsilon_tolerance"], + accountant = 'rdp') + + # make the model private privacy_engine = PrivacyEngine(accountant = 'prv') priv_model, priv_opt, priv_train_dataloader = privacy_engine.make_private( @@ -301,7 +322,7 @@ def dpsgd_gru_trained_model_and_metadata(model, optimizer=optimizer, data_loader=train_dataloader, noise_multiplier=noise_multiplier, - max_grad_norm=max_grad_norm, + max_grad_norm= privacy_engine_dict["max_grad_norm"], ) # Reduce learning rate when a metric has stopped improving @@ -330,6 +351,9 @@ def dpsgd_gru_trained_model_and_metadata(model, for _, (X, labels) in enumerate(tqdm(priv_train_dataloader, desc="Training Batches")): + if X.numel() == 0: # Skip empty batches + continue + if epoch == 0: niter_per_epoch += 1 @@ -372,8 +396,6 @@ def dpsgd_gru_trained_model_and_metadata(model, valid_dataloader_iter = iter(test_dataloader) X_test, labels_test = next(valid_dataloader_iter) - - X_test = X_test.to(device_name) labels_test = labels_test.to(device_name) labels_test = labels_test.long().float() @@ -439,10 +461,21 @@ def dpsgd_gru_trained_model_and_metadata(model, pre_time = cur_time # Move the model back to the CPU # Ensure the target directory exists - os.makedirs("target_GRUD", exist_ok=True) + os.makedirs(target_model_dir, exist_ok=True) + # Access the raw model if it's wrapped in DataParallel + + state_dict = priv_model.state_dict() + cleaned_state_dict = {key.replace("_module.", "").replace("module.", ""): value + for key, value in state_dict.items()} + + # Save the state_dict to a file priv_model.to("cpu") - with open("target_GRUD/target_model.pkl", "wb") as f: - save(priv_model.state_dict(), f) + with open(f"{target_model_dir}/target_model.pkl", "wb") as f: + save(cleaned_state_dict, f) + + # Create metadata for privacy engine + with 
open(f"{target_model_dir}/dpsgd_dic.pkl", "wb") as f: + pickle.dump(privacy_engine_dict, f) # Create metadata and store it meta_data = {} @@ -457,24 +490,25 @@ def dpsgd_gru_trained_model_and_metadata(model, # read out optimizer parameters meta_data["optimizer"] = {} - meta_data["optimizer"]["name"] = priv_opt.__class__.__name__.lower() - meta_data["optimizer"]["lr"] = priv_opt.param_groups[0].get("lr", 0) - meta_data["optimizer"]["weight_decay"] = priv_opt.param_groups[0].get("weight_decay", 0) - meta_data["optimizer"]["momentum"] = priv_opt.param_groups[0].get("momentum", 0) - meta_data["optimizer"]["dampening"] = priv_opt.param_groups[0].get("dampening", 0) - meta_data["optimizer"]["nesterov"] = priv_opt.param_groups[0].get("nesterov", False) + meta_data["optimizer"]["name"] = optimizer.__class__.__name__.lower() + meta_data["optimizer"]["lr"] = optimizer.param_groups[0].get("lr", 0) + meta_data["optimizer"]["weight_decay"] = optimizer.param_groups[0].get("weight_decay", 0) + meta_data["optimizer"]["momentum"] = optimizer.param_groups[0].get("momentum", 0) + meta_data["optimizer"]["dampening"] = optimizer.param_groups[0].get("dampening", 0) + meta_data["optimizer"]["nesterov"] = optimizer.param_groups[0].get("nesterov", False) # read out criterion parameters meta_data["loss"] = {} - meta_data["loss"]["name"] = criterion_CEL.__class__.__name__.lower() + meta_data["loss"]["name"] = criterion_BCE.__class__.__name__.lower() - meta_data["batch_size"] = priv_train_dataloader.batch_size + #priv_train_dataloader.batch_size is dynamic. Therefore, we use the original batch size + meta_data["batch_size"] = train_dataloader.batch_size meta_data["epochs"] = epochs meta_data["train_acc"] = train_acc meta_data["test_acc"] = test_acc meta_data["train_loss"] = train_loss meta_data["test_loss"] = test_loss meta_data["dataset"] = "mimiciii" - with open("target_GRUD/priv_model_metadata.pkl", "wb") as f: + with open(f"{target_model_dir}/model_metadata.pkl", "wb") as f: pickle.dump(meta_data, f) return [train_losses, test_losses, train_acces, test_acces, priv_model, niter_per_epoch, privacy_engine] diff --git a/leakpro/attacks/utils/shadow_model_handler.py b/leakpro/attacks/utils/shadow_model_handler.py index de8d25b9..ad0772dc 100755 --- a/leakpro/attacks/utils/shadow_model_handler.py +++ b/leakpro/attacks/utils/shadow_model_handler.py @@ -166,8 +166,12 @@ def create_shadow_models( train_loss = training_results["metrics"]["loss"] logger.info(f"Training shadow model {i} complete") + shadow_model_state_dict = shadow_model.state_dict() + cleaned_state_dict = {key.replace("_module.", "").replace("module.", ""): value + for key, value in shadow_model_state_dict.items()} + with open(f"{self.storage_path}/{self.model_storage_name}_{i}.pkl", "wb") as f: - save(shadow_model.state_dict(), f) + save(cleaned_state_dict, f) logger.info(f"Saved shadow model {i} to {self.storage_path}") logger.info(f"Storing metadata for shadow model {i}") From be0d72a2c2413d466df0dc9c281f48d1835de669 Mon Sep 17 00:00:00 2001 From: fazelehh Date: Mon, 10 Feb 2025 12:41:44 +0000 Subject: [PATCH 5/5] ruff fix --- examples/expm/dpsgd_handler.py | 32 ++++++------ examples/expm/run_dpsgd_main.py | 15 +++--- examples/expm/run_nonprivate_main.py | 37 +++++++------- examples/expm/utils/dpsgd_model.py | 51 +++++++++++++------ examples/expm/utils/non_private_model.py | 28 +++++++++-- examples/expm/utils/utils.py | 62 +++++++++++------------- 6 files changed, 128 insertions(+), 97 deletions(-) diff --git a/examples/expm/dpsgd_handler.py 
b/examples/expm/dpsgd_handler.py index 5873a6b6..3a93a68e 100644 --- a/examples/expm/dpsgd_handler.py +++ b/examples/expm/dpsgd_handler.py @@ -1,14 +1,16 @@ -from torch import cuda, device, nn, optim, squeeze +import os +import pickle + +from opacus import PrivacyEngine +from opacus.accountants.utils import get_noise_multiplier +from sklearn.metrics import accuracy_score +from torch import cuda, device, nn, optim from torch.nn import BCEWithLogitsLoss from torch.utils.data import DataLoader from tqdm import tqdm -from sklearn.metrics import accuracy_score + from leakpro import AbstractInputHandler -import os -import pickle -from opacus.accountants.utils import get_noise_multiplier -from opacus import PrivacyEngine, GradSampleModule class MimicInputHandlerGRU(AbstractInputHandler): @@ -44,9 +46,9 @@ def train( print("Training shadow models with DP-SGD") - dpsgd_path = self.configs['audit']['dpsgd']['dpsgd_path'] + dpsgd_path = self.configs["audit"]["dpsgd"]["dpsgd_path"] - sample_rate = 1/len(dataloader) + sample_rate = 1/len(dataloader) # Check if the file exists if os.path.exists(dpsgd_path): # Open and read the pickle file @@ -56,14 +58,14 @@ def train( print("Data:", privacy_engine_dict) else: raise Exception(f"File not found at: {dpsgd_path}") - + try: noise_multiplier = get_noise_multiplier(target_epsilon = privacy_engine_dict["target_epsilon"], target_delta = privacy_engine_dict["target_delta"], sample_rate = sample_rate , epochs = privacy_engine_dict["epochs"], epsilon_tolerance = privacy_engine_dict["epsilon_tolerance"], - accountant = 'prv', + accountant = "prv", eps_error = privacy_engine_dict["eps_error"],) except: # the prv accountant is not robust to large epsilon (even epsilon = 10) @@ -74,10 +76,10 @@ def train( sample_rate = sample_rate, epochs = privacy_engine_dict["epochs"], epsilon_tolerance = privacy_engine_dict["epsilon_tolerance"], - accountant = 'rdp') + accountant = "rdp") # make the model private - privacy_engine = PrivacyEngine(accountant = 'prv') + privacy_engine = PrivacyEngine(accountant = "prv") model, optimizer, dataloader = privacy_engine.make_private( module=model, optimizer=optimizer, @@ -105,12 +107,12 @@ def train( labels = labels.float() optimizer.zero_grad() - output = model(x).squeeze(dim=1) + output = model(x).squeeze(dim=1) loss = criterion(output, labels) loss.backward() optimizer.step() - + train_loss += loss.item() train_loss = train_loss/len(dataloader) @@ -120,4 +122,4 @@ def train( # Compute accuracy train_acc = accuracy_score(binary_labels, binary_predictions) - return {"model": model, "metrics": {"accuracy": train_acc, "loss": train_loss}} \ No newline at end of file + return {"model": model, "metrics": {"accuracy": train_acc, "loss": train_loss}} diff --git a/examples/expm/run_dpsgd_main.py b/examples/expm/run_dpsgd_main.py index 9ea3355c..5f4c8d2d 100644 --- a/examples/expm/run_dpsgd_main.py +++ b/examples/expm/run_dpsgd_main.py @@ -1,10 +1,9 @@ import os -import sys from torch import zeros from utils.data_handler import get_mimic_dataloaders, get_mimic_dataset - from utils.dpsgd_model import * + # Import and initialize ReportHandler from leakpro.reporting.report_handler import ReportHandler @@ -44,8 +43,8 @@ "sample_rate": sample_rate, "epochs": 21, "epsilon_tolerance": 0.01, - "accountant": 'prv', - "eps_error": 0.01, + "accountant": "prv", + "eps_error": 0.01, "max_grad_norm": 1, } @@ -64,7 +63,7 @@ "seed": 4410, "min_delta": 0.00001, "epsilon": 3.5, - "max_grad_norm": 1, + "max_grad_norm": 1, } n_features = int(dataset.x.shape[1]/3) 
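Both the DP-SGD trainer and the shadow-model handler in the hunks above now strip wrapper prefixes from parameter names before saving weights. A small helper capturing that cleanup (a sketch; the repo keeps the dict comprehension inline rather than factoring it out):

def clean_state_dict(state_dict: dict) -> dict:
    # Opacus' GradSampleModule prefixes parameter names with "_module." and
    # nn.DataParallel with "module."; stripping both lets the checkpoint be
    # loaded back into a plain, unwrapped model via load_state_dict().
    return {key.replace("_module.", "").replace("module.", ""): value
            for key, value in state_dict.items()}

# e.g. save(clean_state_dict(priv_model.state_dict()), f)
#      save(clean_state_dict(shadow_model.state_dict()), f)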
X_mean = zeros(1,dataset.x.shape[2],n_features) @@ -85,11 +84,11 @@ model = GRUD_DPSGD(**model_params) # Train the model results= dpsgd_gru_trained_model_and_metadata( - model, + model, train_loader, - early_stop_loader, + early_stop_loader, noise_multiplier_dict, - epochs=optimized_hyperparams['num_epochs'], + epochs=optimized_hyperparams["num_epochs"], patience_early_stopping = optimized_hyperparams["patience_early_stopping"], patience_lr= optimized_hyperparams["patience_lr_scheduler"], min_delta = optimized_hyperparams["min_delta"], diff --git a/examples/expm/run_nonprivate_main.py b/examples/expm/run_nonprivate_main.py index b46cb498..6fdd2ede 100644 --- a/examples/expm/run_nonprivate_main.py +++ b/examples/expm/run_nonprivate_main.py @@ -1,11 +1,10 @@ import os -import sys -from torch import zeros -from examples.expm.utils.data_preparation import get_mimic_dataloaders, get_mimic_dataset from opacus.accountants.utils import get_noise_multiplier +from torch import zeros from utils.gru_model_handler import * +from examples.expm.utils.data_preparation import get_mimic_dataloaders, get_mimic_dataset # Generate the dataset and dataloaders path = os.path.join(os.getcwd(), "examples/expm/data/mimic/") @@ -42,7 +41,7 @@ sample_rate = sample_rate, epochs = 21, epsilon_tolerance = 0.01, - accountant = 'prv', + accountant = "prv", eps_error = 0.01) except: # the prv accountant is not robust to large epsilon (even epsilon = 10) @@ -53,19 +52,19 @@ sample_rate = sample_rate, epochs = 21, epsilon_tolerance = 0.01, - accountant = 'rdp') - + accountant = "rdp") + # Initialize model with the best hyperparameters model_params = { - 'X_mean': X_mean, - 'input_size': X_mean.shape[2], - 'device_id': device, - 'cell_size': h['cell_size'], - 'hidden_size': h['hidden_size'], - 'batch_size': h['batch_size'], - 'apply_sigmoid': h['loss'] == 'l2', # Only add apply_sigmoid for 'l2' loss - 'use_bn': h["use_bn"] != "nobn", # Use batch norm unless 'nobn' specified + "X_mean": X_mean, + "input_size": X_mean.shape[2], + "device_id": device, + "cell_size": h["cell_size"], + "hidden_size": h["hidden_size"], + "batch_size": h["batch_size"], + "apply_sigmoid": h["loss"] == "l2", # Only add apply_sigmoid for 'l2' loss + "use_bn": h["use_bn"] != "nobn", # Use batch norm unless 'nobn' specified } @@ -80,7 +79,7 @@ "seed": 4410, "min_delta": 0.00001, "epsilon": 10, - "max_grad_norm": 1, + "max_grad_norm": 1, } n_features = int(dataset.x.shape[1]/3) X_mean = zeros(1,dataset.x.shape[2],n_features) @@ -106,12 +105,12 @@ model = GRUD(**model_params) # Train the model results= dpsgd_gru_trained_model_and_metadata( - model, + model, train_loader, - early_stop_loader, + early_stop_loader, noise_multiplier, - max_grad_norm = optimized_hyperparams['max_grad_norm'], - epochs=optimized_hyperparams['num_epochs'], + max_grad_norm = optimized_hyperparams["max_grad_norm"], + epochs=optimized_hyperparams["num_epochs"], patience_early_stopping = optimized_hyperparams["patience_early_stopping"], patience_lr= optimized_hyperparams["patience_lr_scheduler"], min_delta = optimized_hyperparams["min_delta"], diff --git a/examples/expm/utils/dpsgd_model.py b/examples/expm/utils/dpsgd_model.py index 273da64b..d8be58fd 100644 --- a/examples/expm/utils/dpsgd_model.py +++ b/examples/expm/utils/dpsgd_model.py @@ -1,5 +1,4 @@ -""" -This file is inspired by https://github.com/MLforHealth/MIMIC_Extract +"""This file is inspired by https://github.com/MLforHealth/MIMIC_Extract MIT License Copyright (c) 2019 MIT Laboratory for Computational Physiology """ @@ 
-13,14 +12,33 @@ import pandas as pd import torch.nn.functional as F import torch.utils.data as utils +from opacus import PrivacyEngine +from opacus.accountants.utils import get_noise_multiplier from sklearn.metrics import accuracy_score -from torch import Tensor, cat, cuda, device, exp, eye, from_numpy, isnan, max, nn, optim, save, sigmoid, squeeze, tanh, zeros, no_grad +from torch import ( + Tensor, + cat, + cuda, + device, + exp, + eye, + from_numpy, + isnan, + max, + nn, + no_grad, + optim, + save, + sigmoid, + squeeze, + tanh, + zeros, +) from torch.autograd import Variable from torch.nn.parameter import Parameter from torch.optim.lr_scheduler import ReduceLROnPlateau from tqdm import tqdm -from opacus import PrivacyEngine, GradSampleModule -from opacus.accountants.utils import get_noise_multiplier + def to_3D_tensor(df): idx = pd.IndexSlice @@ -158,7 +176,8 @@ def step(self, x, x_last_obsv, x_mean, h, mask, delta): mask: the mask of whether or not the current value is observed delta: the tensor indicating the number of steps since the last time a feature was observed. - Returns: + Returns + ------- h: the updated hidden state of the network """ @@ -301,7 +320,7 @@ def dpsgd_gru_trained_model_and_metadata(model, sample_rate = sample_rate , epochs = privacy_engine_dict["epochs"], epsilon_tolerance = privacy_engine_dict["epsilon_tolerance"], - accountant = 'prv', + accountant = "prv", eps_error = privacy_engine_dict["eps_error"],) except: # the prv accountant is not robust to large epsilon (even epsilon = 10) @@ -312,11 +331,11 @@ def dpsgd_gru_trained_model_and_metadata(model, sample_rate = sample_rate, epochs = privacy_engine_dict["epochs"], epsilon_tolerance = privacy_engine_dict["epsilon_tolerance"], - accountant = 'rdp') - + accountant = "rdp") + # make the model private - privacy_engine = PrivacyEngine(accountant = 'prv') + privacy_engine = PrivacyEngine(accountant = "prv") priv_model, priv_opt, priv_train_dataloader = privacy_engine.make_private( module=model, optimizer=optimizer, @@ -353,7 +372,7 @@ def dpsgd_gru_trained_model_and_metadata(model, if X.numel() == 0: # Skip empty batches continue - + if epoch == 0: niter_per_epoch += 1 @@ -361,7 +380,7 @@ def dpsgd_gru_trained_model_and_metadata(model, labels = labels.to(device_name) labels = labels.long().float() prediction = priv_model(X) - prediction = prediction.squeeze(dim=1) + prediction = prediction.squeeze(dim=1) output_last = True if output_last: @@ -399,11 +418,11 @@ def dpsgd_gru_trained_model_and_metadata(model, X_test = X_test.to(device_name) labels_test = labels_test.to(device_name) labels_test = labels_test.long().float() - - with no_grad(): + + with no_grad(): prediction_test = priv_model(X_test) - prediction_test = prediction_test.squeeze(dim=1) + prediction_test = prediction_test.squeeze(dim=1) if output_last: @@ -502,7 +521,7 @@ def dpsgd_gru_trained_model_and_metadata(model, meta_data["loss"]["name"] = criterion_BCE.__class__.__name__.lower() #priv_train_dataloader.batch_size is dynamic. 
Therefore, we use the original batch size - meta_data["batch_size"] = train_dataloader.batch_size + meta_data["batch_size"] = train_dataloader.batch_size meta_data["epochs"] = epochs meta_data["train_acc"] = train_acc meta_data["test_acc"] = test_acc diff --git a/examples/expm/utils/non_private_model.py b/examples/expm/utils/non_private_model.py index 15b9b780..56f642d2 100644 --- a/examples/expm/utils/non_private_model.py +++ b/examples/expm/utils/non_private_model.py @@ -1,5 +1,4 @@ -""" -This file is inspired by https://github.com/MLforHealth/MIMIC_Extract +"""This file is inspired by https://github.com/MLforHealth/MIMIC_Extract MIT License Copyright (c) 2019 MIT Laboratory for Computational Physiology """ @@ -14,12 +13,30 @@ import torch.nn.functional as F import torch.utils.data as utils from sklearn.metrics import accuracy_score -from torch import Tensor, cat, cuda, device, exp, eye, from_numpy, isnan, max, nn, optim, save, sigmoid, squeeze, tanh, zeros, no_grad +from torch import ( + Tensor, + cat, + cuda, + device, + exp, + eye, + from_numpy, + isnan, + max, + nn, + optim, + save, + sigmoid, + squeeze, + tanh, + zeros, +) from torch.autograd import Variable from torch.nn.parameter import Parameter from torch.optim.lr_scheduler import ReduceLROnPlateau from tqdm import tqdm + def to_3D_tensor(df): idx = pd.IndexSlice np_3D = np.dstack([df.loc[idx[:, :, :, i], :].values for i in sorted(set(df.index.get_level_values("hours_in")))]) @@ -157,7 +174,8 @@ def step(self, x, x_last_obsv, x_mean, h, mask, delta): mask: the mask of whether or not the current value is observed delta: the tensor indicating the number of steps since the last time a feature was observed. - Returns: + Returns + ------- h: the updated hidden state of the network """ @@ -359,7 +377,7 @@ def gru_trained_model_and_metadata(model, labels_test = labels_test.long() prediction_test = model(X_test) - + if output_last: diff --git a/examples/expm/utils/utils.py b/examples/expm/utils/utils.py index 6d3ae71d..b473d1a8 100644 --- a/examples/expm/utils/utils.py +++ b/examples/expm/utils/utils.py @@ -12,26 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
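A note on the `if X.numel() == 0: continue` guard added to the DP-SGD training loop above: PrivacyEngine.make_private() swaps the training DataLoader's sampler for Poisson subsampling, which can legitimately draw zero examples in a step. A condensed sketch of the guarded loop, using the names from the diff; the output_last branching and metric bookkeeping of the full function are omitted here:

for X, labels in priv_train_dataloader:
    if X.numel() == 0:   # Poisson subsampling can produce an empty batch
        continue
    X = X.to(device_name)
    labels = labels.to(device_name).float()
    priv_opt.zero_grad()
    loss = criterion_BCE(priv_model(X).squeeze(dim=1), labels)
    loss.backward()
    priv_opt.step()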
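For context on the step() method whose docstring is reformatted above: GRU-D imputes a missing input by decaying its last observed value toward the empirical mean as the time since the last observation (delta) grows. A hedged sketch of that input decay; parameter names and shapes here are illustrative, not the repo's exact ones:

import torch

def decay_impute(x, x_last_obsv, x_mean, mask, delta, w_gamma_x, b_gamma_x):
    # gamma lies in (0, 1]: close to 1 right after an observation (small delta),
    # shrinking toward 0 as delta grows, assuming positive learned decay weights.
    gamma_x = torch.exp(-torch.clamp(w_gamma_x * delta + b_gamma_x, min=0.0))
    x_hat = gamma_x * x_last_obsv + (1.0 - gamma_x) * x_mean
    return mask * x + (1.0 - mask) * x_hat   # keep observed values, impute the rest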
+# file is from MIMIC Extract Paper +import os +from pathlib import Path from typing import Optional +import numpy as np +import pandas as pd +import torch +from opacus import PrivacyEngine from opacus.accountants import create_accountant -# file is from MIMIC Extract Paper -import copy, math, os, pickle, time, pandas as pd, numpy as np, scipy.stats as ss - -from sklearn.linear_model import LogisticRegression -from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score - -from opacus import PrivacyEngine, GradSampleModule -import torch, torch.utils.data as utils, torch.nn as nn, torch.nn.functional as F, torch.optim as optim -from torch.autograd import Variable -from torch.nn.parameter import Parameter from torch.optim.lr_scheduler import ReduceLROnPlateau - -from pathlib import Path - - MAX_SIGMA = 1e6 @@ -46,11 +38,11 @@ def get_noise_multiplier( epsilon_tolerance: float = 0.01, **kwargs, ) -> float: - r""" - Computes the noise level sigma to reach a total budget of (target_epsilon, target_delta) + r"""Computes the noise level sigma to reach a total budget of (target_epsilon, target_delta) at the end of epochs, with a given sample_rate Args: + ---- target_epsilon: the privacy budget's epsilon target_delta: the privacy budget's delta sample_rate: the sampling rate (usually batch_size / n_data) @@ -60,6 +52,7 @@ def get_noise_multiplier( epsilon_tolerance: precision for the binary search Returns: The noise level sigma to ensure privacy budget of (target_epsilon, target_delta) + """ if (steps is None) == (epochs is None): raise ValueError( @@ -93,11 +86,10 @@ def get_noise_multiplier( return sigma_high -def Train_Model_DPSGD(pre_model, loss_fn, pre_train_dataloader, noise_multiplier, - max_grad_norm = 1, num_epochs = 300, patience = 1000, +def Train_Model_DPSGD(pre_model, loss_fn, pre_train_dataloader, noise_multiplier, + max_grad_norm = 1, num_epochs = 300, patience = 1000, learning_rate=1e-3, batch_size=None): - """ - Inputs: + """Inputs: pre_model: a GRUD model loss_fn: the loss function to use pre_train_dataloader: training data @@ -108,15 +100,17 @@ def Train_Model_DPSGD(pre_model, loss_fn, pre_train_dataloader, noise_multiplier min_delta: if the loss stays within this value on the next step stop early batch_size: size of a batch - Returns: + Returns + ------- best_model losses_train losses_epochs_train + """ pre_opt = torch.optim.Adam(pre_model.parameters(), lr = learning_rate) # make private - privacy_engine = PrivacyEngine(accountant = 'prv') + privacy_engine = PrivacyEngine(accountant = "prv") priv_model, priv_opt, priv_train_dataloader = privacy_engine.make_private( module=pre_model, optimizer=pre_opt, @@ -124,7 +118,7 @@ def Train_Model_DPSGD(pre_model, loss_fn, pre_train_dataloader, noise_multiplier noise_multiplier=noise_multiplier, max_grad_norm=max_grad_norm, ) - scheduler = ReduceLROnPlateau(priv_opt, 'min', patience=patience, verbose = True) + scheduler = ReduceLROnPlateau(priv_opt, "min", patience=patience, verbose = True) # losses_train = [] # losses_epochs_train = [] @@ -149,17 +143,17 @@ def Train_Model_DPSGD(pre_model, loss_fn, pre_train_dataloader, noise_multiplier m_shape = measurement.shape[0] # we delete last column and prepend mean so that the last observed is used measurement_last_obsv = measurement[:, 0:measurement.shape[1]-1, :] - measurement_last_obsv = torch.cat((torch.stack([X_mean[:, 0, :]]*m_shape), + measurement_last_obsv = torch.cat((torch.stack([X_mean[:, 0, 
:]]*m_shape), measurement_last_obsv), dim = 1) convert_to_tensor = lambda x: torch.autograd.Variable(x) - X, X_last_obsv, Mask, Delta, labels = map(convert_to_tensor, - [measurement, + X, X_last_obsv, Mask, Delta, labels = map(convert_to_tensor, + [measurement, measurement_last_obsv, mask, time_, labels]) - + priv_model.zero_grad() prediction = priv_model(X, X_last_obsv, Mask, Delta) @@ -183,15 +177,15 @@ def get_results_df(RESULTS_FOLDER, h_pass, run, task, verbose = False): task_d = {} i = 0 folder = Path(RESULTS_FOLDER, f"{h_pass}{run}") - for filename in folder.glob('*'): + for filename in folder.glob("*"): if os.path.isdir(filename): - for subfilename in filename.glob('*'): - if task in str(filename) and 'json' in str(subfilename) and 'results' in str(subfilename): + for subfilename in filename.glob("*"): + if task in str(filename) and "json" in str(subfilename) and "results" in str(subfilename): task_d[i] = unjsonify(subfilename) i += 1 - if task in str(filename) and 'json' in str(filename): + if task in str(filename) and "json" in str(filename): task_d[i] = unjsonify(filename) i += 1 - if verbose: print(f'---Processing {h_pass}{run} run for {task} ICU hyperparameter results -----') + if verbose: print(f"---Processing {h_pass}{run} run for {task} ICU hyperparameter results -----") task_df = pd.concat([pd.json_normalize(task_d[j]) for j in range(0,i)]) - return task_df \ No newline at end of file + return task_df
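The local get_noise_multiplier in utils.py above mirrors Opacus' upstream helper: it doubles sigma until the accountant's epsilon falls below the target, then bisects down to the requested tolerance, giving up once sigma exceeds MAX_SIGMA. A hedged sketch of that search (the diff elides the loop body, so details may differ from the file):

from opacus.accountants import create_accountant

def search_sigma(target_epsilon, target_delta, sample_rate, steps,
                 accountant="prv", epsilon_tolerance=0.01, max_sigma=1e6):
    eps_high, sigma_low, sigma_high = float("inf"), 0.0, 10.0
    acct = create_accountant(mechanism=accountant)

    # Phase 1: grow sigma_high until the privacy budget is satisfied.
    while eps_high > target_epsilon:
        sigma_high *= 2
        acct.history = [(sigma_high, sample_rate, steps)]
        eps_high = acct.get_epsilon(delta=target_delta)
        if sigma_high > max_sigma:
            raise ValueError("The privacy budget is too low.")

    # Phase 2: bisect between sigma_low and sigma_high.
    while target_epsilon - eps_high > epsilon_tolerance:
        sigma = (sigma_low + sigma_high) / 2
        acct.history = [(sigma, sample_rate, steps)]
        eps = acct.get_epsilon(delta=target_delta)
        if eps < target_epsilon:
            sigma_high, eps_high = sigma, eps
        else:
            sigma_low = sigma
    return sigma_high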
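Finally, a usage example for get_results_df as rewritten above. The folder path, h_pass, run, and task values are assumptions chosen for illustration, and unjsonify is the repo's JSON loader:

results_df = get_results_df(RESULTS_FOLDER="examples/expm/results",
                            h_pass="grid", run=0, task="mort_icu", verbose=True)
print(results_df.head())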