From f30b53a68ad711126730998a0f0cef906f83b062 Mon Sep 17 00:00:00 2001 From: fazelehh <78154933+fazelehh@users.noreply.github.com> Date: Fri, 10 Jan 2025 12:43:36 +0100 Subject: [PATCH] GRU model for LOS (#170) * ReadMe file for creating dataset from MIMICIII, and training LOS>3 classifier * adding links to readMe file * fixing the LR model * Mimic_Hnadler, data_processeing, model and the main file * all attacks are running * add relevant files from MIMIC_Extract and mimic_code repos * fix some bugs regarding nivduration * revert conflicting tabular_mia files * adding subset to the mimicDataset * fixing loading the database and indices * adding GRU * adding GRU-D structure * updating grud class * adding early stopping * the model is running * the model is running * working on meta data * mimic hanlder gru is added * LiRA in place * running lira * LiRA and RMIA for GRU, temp implmemntation * adding flag for changing the classifier * LR and read me * up to adding report handler module * renaming files, adding report handler module * renaming files, adding report handler module * adding the license, renaming some files and classes * fixing the accuracy in model handler * fixing pandas warning and adding a check to saved shadow models model type * fixing a bug in shadow_model_handler * ruff fix --- examples/mia/LOS/ReadMe.md | 5 + examples/mia/LOS/audit.yaml | 27 +- examples/mia/LOS/mimic_GRUD_main.ipynb | 846 ++++++++++++++++++ .../{mimic_handler.py => mimic_LR_handler.py} | 13 +- examples/mia/LOS/mimic_LR_main.ipynb | 318 +++++++ examples/mia/LOS/mimic_gru_handler.py | 82 ++ examples/mia/LOS/mimic_main.ipynb | 278 ------ .../mimiciii_prepration/MIMIC_Extract/run.sh | 2 +- examples/mia/LOS/utils/data_processing.py | 366 +++++--- .../mia/LOS/utils/{model.py => model_LR.py} | 79 +- examples/mia/LOS/utils/model_grud.py | 447 +++++++++ examples/mia/cifar/audit.yaml | 22 +- leakpro/attacks/utils/shadow_model_handler.py | 61 +- .../input_handler/abstract_input_handler.py | 
5 + 14 files changed, 2063 insertions(+), 488 deletions(-) create mode 100644 examples/mia/LOS/ReadMe.md create mode 100644 examples/mia/LOS/mimic_GRUD_main.ipynb rename examples/mia/LOS/{mimic_handler.py => mimic_LR_handler.py} (92%) create mode 100644 examples/mia/LOS/mimic_LR_main.ipynb create mode 100644 examples/mia/LOS/mimic_gru_handler.py delete mode 100644 examples/mia/LOS/mimic_main.ipynb rename examples/mia/LOS/utils/{model.py => model_LR.py} (76%) create mode 100644 examples/mia/LOS/utils/model_grud.py diff --git a/examples/mia/LOS/ReadMe.md b/examples/mia/LOS/ReadMe.md new file mode 100644 index 00000000..973e906d --- /dev/null +++ b/examples/mia/LOS/ReadMe.md @@ -0,0 +1,5 @@ +# Length-of-Stay Use Case +In this use case, we focus on attacking length-of-stay classifier models. As part of the example, we train a Logistic Regression model and a Gated Recurrent Unit with Decay (GRU-D). + +## MIMIC-III Data Preprocessing +To prepare the data, refer to instructions in ```mimiciii_prepration/ReadMe.md``` diff --git a/examples/mia/LOS/audit.yaml b/examples/mia/LOS/audit.yaml index 62ef45e2..f6cb89d9 100644 --- a/examples/mia/LOS/audit.yaml +++ b/examples/mia/LOS/audit.yaml @@ -1,15 +1,15 @@ audit: # Configurations for auditing random_seed: 1234 # Integer specifying the random seed attack_list: - # rmia: - # training_data_fraction: 0.5 # Fraction of the auxilary dataset to use for this attack (in each shadow model training) - # attack_data_fraction: 0.5 # Fraction of auxiliary dataset to sample from during attack - # num_shadow_models: 8 # Number of shadow models to train - # online: True # perform online or offline attack - # temperature: 2 - # gamma: 1.0 - # offline_a: 0.33 # parameter from which we compute p(x) from p_OUT(x) such that p_IN(x) = a p_OUT(x) + b. 
- # offline_b: 0.66 + rmia: + training_data_fraction: 0.5 # Fraction of the auxiliary dataset to use for this attack (in each shadow model training) + attack_data_fraction: 0.5 # Fraction of auxiliary dataset to sample from during attack + num_shadow_models: 8 # Number of shadow models to train + online: True # perform online or offline attack + temperature: 2 + gamma: 1.0 + offline_a: 0.33 # parameter from which we compute p(x) from p_OUT(x) such that p_IN(x) = a p_OUT(x) + b. + offline_b: 0.66 # qmia: # training_data_fraction: 1.0 # Fraction of the auxilary dataset (data without train and test indices) to use for training the quantile regressor # epochs: 5 # Number of training epochs for quantile regression @@ -32,12 +32,13 @@ audit: # Configurations for auditing target: # Target model path - module_path: "utils/model.py" - model_class: "MimicLR" + module_path: "utils/model_LR.py" # either model_grud.py or model_LR.py for logistic regression + model_class: "LR" # LR/GRUD # Data paths - target_folder: "./target" - data_path: "./data/dataset.pkl" + target_folder: "./target_LR" # either target_GRUD or target_LR + data_path: "./data/flattened/dataset.pkl" #unflattened dataset for GRUD and flattened dataset for LR shadow_model: + model_class: # LR/GRUD distillation_model: diff --git a/examples/mia/LOS/mimic_GRUD_main.ipynb b/examples/mia/LOS/mimic_GRUD_main.ipynb new file mode 100644 index 00000000..8676ba82 --- /dev/null +++ b/examples/mia/LOS/mimic_GRUD_main.ipynb @@ -0,0 +1,846 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MIA attacks on Length-of-Stay predictor, Gated Recurrent Unit with Decay (GRU-D)\n", + "## Installation of Packages in Conda\n", + "\n", + "To install the required packages in your conda environment, you can use the following commands:\n", + "\n", + "```bash\n", + "conda install h5py\n", + "conda install pytables" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], 
+ "source": [ + "%reload_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "\n", + "from torch import zeros\n", + "\n", + "project_root = os.path.abspath(os.path.join(os.getcwd(), \"../../..\"))\n", + "sys.path.append(project_root)\n", + "\n", + "from utils.data_processing import get_mimic_dataloaders, get_mimic_dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `batch_size` is one of the parameters which is assigned based on hyperparameter tuning as detailed in [this notebook](https://github.com/MLforHealth/MIMIC_Extract/blob/4daf3c89be7de05d26f47819d68d5532de6f753a/notebooks/Baselines%20for%20Mortality%20and%20LOS%20prediction%20-%20GRU-D.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading dataset...\n", + "Loaded dataset from /home/fazeleh/LeakPro/examples/mia/LOS/data/unflattened/dataset.pkl\n" + ] + } + ], + "source": [ + "# Generate the dataset and dataloaders\n", + "path = os.path.join(os.getcwd(), \"data/\")\n", + "\n", + "train_frac = 0.4\n", + "valid_frac = 0.0\n", + "test_frac = 0.0\n", + "early_stop_frac = 0.4\n", + "batch_size = 59\n", + "use_LR = False # True if you want to use the LR model, False if you want to use the GRUD model\n", + "\n", + "dataset, train_indices, validation_indices, test_indices, early_stop_indices= get_mimic_dataset(path,\n", + " train_frac ,\n", + " valid_frac,\n", + " test_frac,\n", + " early_stop_frac,\n", + " use_LR)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "train_loader, validation_loader, test_loader, early_stop_loader = get_mimic_dataloaders(dataset,\n", + " train_indices,\n", + " validation_indices,\n", + " test_indices,\n", + " early_stop_indices,\n", + " 
batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `optimized_hyperparams` is assigned based on hyperparameter tuning as detailed in [this notebook](https://github.com/MLforHealth/MIMIC_Extract/blob/4daf3c89be7de05d26f47819d68d5532de6f753a/notebooks/Baselines%20for%20Mortality%20and%20LOS%20prediction%20-%20GRU-D.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "optimized_hyperparams ={\n", + " \"cell_size\": 58,\n", + " \"hidden_size\": 78,\n", + " \"learning_rate\": 0.0004738759319792616,\n", + " \"num_epochs\":37,\n", + " \"patience_early_stopping\": 20,\n", + " \"patience_lr_scheduler\": 5,\n", + " \"batch_size\": 59,\n", + " \"seed\": 4410,\n", + " \"min_delta\": 0.00001,\n", + " }\n", + "n_features = int(dataset.x.shape[1]/3)\n", + "X_mean = zeros(1,dataset.x.shape[2],n_features)\n", + "\n", + "model_params = {k: optimized_hyperparams[k] for k in [\"cell_size\", \"hidden_size\", \"batch_size\"]}\n", + "\n", + "# Add other required parameters to model_params\n", + "model_params.update({\n", + " \"input_size\": n_features,\n", + " \"X_mean\": X_mean,\n", + " \"output_last\": False\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Structure: GRUD(\n", + " (zl): Linear(in_features=286, out_features=78, bias=True)\n", + " (rl): Linear(in_features=286, out_features=78, bias=True)\n", + " (hl): Linear(in_features=286, out_features=78, bias=True)\n", + " (gamma_x_l): FilterLinear(in_features=104, out_features=104, bias=True)\n", + " (gamma_h_l): Linear(in_features=104, out_features=78, bias=True)\n", + " (fc): Linear(in_features=78, out_features=2, bias=True)\n", + " (bn): BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (drop): Dropout(p=0.7, inplace=False)\n", + ")\n", + "Start Training ... 
\n", + "Output type dermined by the model\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:07<00:00, 22.28it/s]\n", + "Training Progress: 3%|▎ | 1/37 [00:07<04:24, 7.34s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 0: Validation loss improved to 0.5127\n", + "Learning Rate: 0.000474\n", + "Epoch: 0, train_loss: 0.90600625, valid_loss: 0.51268703, time: 7.34\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 24.12it/s]\n", + "Training Progress: 5%|▌ | 2/37 [00:14<04:05, 7.01s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1: Validation loss improved to 0.5078\n", + "Learning Rate: 0.000474\n", + "Epoch: 1, train_loss: 0.80034931, valid_loss: 0.50783253, time: 6.78\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 23.95it/s]\n", + "Training Progress: 8%|▊ | 3/37 [00:20<03:55, 6.93s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 2: No improvement. Patience counter: 1/20\n", + "Learning Rate: 0.000474\n", + "Epoch: 2, train_loss: 0.76733459, valid_loss: 0.55360639, time: 6.83\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 24.15it/s]\n", + "Training Progress: 11%|█ | 4/37 [00:27<03:46, 6.87s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 3: No improvement. 
Patience counter: 2/20\n", + "Learning Rate: 0.000474\n", + "Epoch: 3, train_loss: 0.74066611, valid_loss: 0.53837186, time: 6.77\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 23.87it/s]\n", + "Training Progress: 14%|█▎ | 5/37 [00:34<03:39, 6.86s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 4: No improvement. Patience counter: 3/20\n", + "Learning Rate: 0.000474\n", + "Epoch: 4, train_loss: 0.72348623, valid_loss: 0.52800447, time: 6.85\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 23.92it/s]\n", + "Training Progress: 16%|█▌ | 6/37 [00:41<03:32, 6.86s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 5: No improvement. Patience counter: 4/20\n", + "Learning Rate: 0.000474\n", + "Epoch: 5, train_loss: 0.70422922, valid_loss: 0.5873853, time: 6.84\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 23.78it/s]\n", + "Training Progress: 19%|█▉ | 7/37 [00:48<03:25, 6.86s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 6: No improvement. Patience counter: 5/20\n", + "Learning Rate: 0.000474\n", + "Epoch: 6, train_loss: 0.68721295, valid_loss: 0.54968578, time: 6.88\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 23.98it/s]\n", + "Training Progress: 22%|██▏ | 8/37 [00:55<03:18, 6.85s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 7: No improvement. 
Patience counter: 6/20\n", + "Epoch 00008: reducing learning rate of group 0 to 2.3694e-04.\n", + "Learning Rate: 0.000237\n", + "Epoch: 7, train_loss: 0.65936494, valid_loss: 0.56551975, time: 6.82\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 23.80it/s]\n", + "Training Progress: 24%|██▍ | 9/37 [01:02<03:12, 6.86s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 8: No improvement. Patience counter: 7/20\n", + "Learning Rate: 0.000237\n", + "Epoch: 8, train_loss: 0.61763578, valid_loss: 0.57842529, time: 6.88\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 23.89it/s]\n", + "Training Progress: 27%|██▋ | 10/37 [01:08<03:05, 6.86s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 9: No improvement. Patience counter: 8/20\n", + "Learning Rate: 0.000237\n", + "Epoch: 9, train_loss: 0.62895368, valid_loss: 0.56870216, time: 6.85\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 23.81it/s]\n", + "Training Progress: 30%|██▉ | 11/37 [01:15<02:58, 6.86s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 10: No improvement. Patience counter: 9/20\n", + "Learning Rate: 0.000237\n", + "Epoch: 10, train_loss: 0.6022617, valid_loss: 0.5680961, time: 6.87\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 23.66it/s]\n", + "Training Progress: 32%|███▏ | 12/37 [01:22<02:51, 6.88s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 11: No improvement. 
Patience counter: 10/20\n", + "Learning Rate: 0.000237\n", + "Epoch: 11, train_loss: 0.59018976, valid_loss: 0.55305964, time: 6.92\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 23.78it/s]\n", + "Training Progress: 35%|███▌ | 13/37 [01:29<02:45, 6.88s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 12: No improvement. Patience counter: 11/20\n", + "Learning Rate: 0.000237\n", + "Epoch: 12, train_loss: 0.58661764, valid_loss: 0.57494694, time: 6.88\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 23.83it/s]\n", + "Training Progress: 38%|███▊ | 14/37 [01:36<02:38, 6.87s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 13: No improvement. Patience counter: 12/20\n", + "Epoch 00014: reducing learning rate of group 0 to 1.1847e-04.\n", + "Learning Rate: 0.000118\n", + "Epoch: 13, train_loss: 0.58373045, valid_loss: 0.56426746, time: 6.87\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 23.66it/s]\n", + "Training Progress: 41%|████ | 15/37 [01:43<02:31, 6.89s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 14: No improvement. Patience counter: 13/20\n", + "Learning Rate: 0.000118\n", + "Epoch: 14, train_loss: 0.5602767, valid_loss: 0.56786698, time: 6.91\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 23.72it/s]\n", + "Training Progress: 43%|████▎ | 16/37 [01:50<02:24, 6.89s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 15: No improvement. 
Patience counter: 14/20\n", + "Learning Rate: 0.000118\n", + "Epoch: 15, train_loss: 0.55402973, valid_loss: 0.56124943, time: 6.9\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 23.68it/s]\n", + "Training Progress: 46%|████▌ | 17/37 [01:57<02:17, 6.90s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 16: No improvement. Patience counter: 15/20\n", + "Learning Rate: 0.000118\n", + "Epoch: 16, train_loss: 0.55244331, valid_loss: 0.5543347, time: 6.91\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 23.61it/s]\n", + "Training Progress: 49%|████▊ | 18/37 [02:04<02:11, 6.91s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 17: No improvement. Patience counter: 16/20\n", + "Learning Rate: 0.000118\n", + "Epoch: 17, train_loss: 0.54556851, valid_loss: 0.56179345, time: 6.93\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 23.75it/s]\n", + "Training Progress: 51%|█████▏ | 19/37 [02:10<02:04, 6.90s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 18: No improvement. Patience counter: 17/20\n", + "Learning Rate: 0.000118\n", + "Epoch: 18, train_loss: 0.53646851, valid_loss: 0.57852155, time: 6.89\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:06<00:00, 26.42it/s]\n", + "Training Progress: 54%|█████▍ | 20/37 [02:17<01:53, 6.69s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 19: No improvement. 
Patience counter: 18/20\n", + "Epoch 00020: reducing learning rate of group 0 to 5.9234e-05.\n", + "Learning Rate: 0.000059\n", + "Epoch: 19, train_loss: 0.53324886, valid_loss: 0.57487661, time: 6.19\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:05<00:00, 27.72it/s]\n", + "Training Progress: 57%|█████▋ | 21/37 [02:23<01:43, 6.45s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 20: No improvement. Patience counter: 19/20\n", + "Learning Rate: 0.000059\n", + "Epoch: 20, train_loss: 0.52443605, valid_loss: 0.57012284, time: 5.9\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Batches: 100%|██████████| 163/163 [00:05<00:00, 27.85it/s]\n", + "Training Progress: 57%|█████▋ | 21/37 [02:28<01:53, 7.09s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 21: No improvement. Patience counter: 20/20\n", + "Early stopping at epoch 21. Best validation loss: 0.5078\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from utils.model_grud import *\n", + "\n", + "# Initialize the model with filtered parameters\n", + "model = GRUD(**model_params)\n", + "\n", + "# Train the model with Train_Model function\n", + "train_losses, test_losses , train_acc, test_acc = gru_trained_model_and_metadata(model,\n", + " train_loader,\n", + " early_stop_loader,\n", + " epochs = optimized_hyperparams[\"num_epochs\"],\n", + " patience_early_stopping = optimized_hyperparams[\"patience_early_stopping\"],\n", + " patience_lr= optimized_hyperparams[\"patience_lr_scheduler\"],\n", + " min_delta = optimized_hyperparams[\"min_delta\"],\n", + " learning_rate = optimized_hyperparams[\"learning_rate\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Convert losses to numpy-compatible lists directly\n", + "train_losses_cpu = [float(loss) for loss in train_losses]\n", + "test_losses_cpu = [float(loss) for loss in test_losses]\n", + "\n", + "# Plot training and test accuracy\n", + "plt.figure(figsize=(5, 4))\n", + "\n", + "plt.subplot(1, 2, 1)\n", + "plt.plot(train_acc, label=\"Train Accuracy\")\n", + "plt.plot(test_acc, label=\"Test Accuracy\")\n", + "plt.xlabel(\"Epoch\")\n", + "plt.ylabel(\"Accuracy\")\n", + "plt.title(\"Accuracy over Epochs\")\n", + "plt.legend()\n", + "\n", + "# Plot training and test loss\n", + "plt.subplot(1, 2, 2)\n", + "plt.plot(train_losses, label=\"Train Loss\")\n", + "plt.plot(test_losses, label=\"Test Loss\")\n", + "plt.xlabel(\"Epoch\")\n", + "plt.ylabel(\"Loss\")\n", + "plt.title(\"Loss over Epochs\")\n", + "plt.legend()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Attacking the GRUD model\n", + "Modify ```audit.yaml ``` file to attack GRUD model: \n", + " \n", + " ```\n", + " module_path: \"utils/model_grud.py\" \n", + " model_class: \"GRUD\"\n", + " target_folder: \"./target_GRUD\"\n", + " data_path: \"./data/unflattened/dataset.pkl\"\n", + " ```" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-01-04 09:41:35,162 INFO Target model blueprint created from GRUD in utils/model_grud.py.\n", + "2025-01-04 09:41:35,170 INFO Loaded target model metadata from ./target_GRUD/model_metadata.pkl\n", + "2025-01-04 09:41:35,174 INFO Loaded target model from ./target_GRUD\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-01-04 09:41:36,126 INFO Loaded population dataset from ./data/unflattened/dataset.pkl\n", + "2025-01-04 
09:41:36,127 INFO Loaded population dataset from ./data/unflattened/dataset.pkl\n", + "2025-01-04 09:41:36,128 INFO Creating shadow model handler singleton\n", + "2025-01-04 09:41:36,132 INFO Creating distillation model handler singleton\n", + "2025-01-04 09:41:36,138 INFO Configuring RMIA attack\n", + "2025-01-04 09:41:36,138 INFO Added attack: rmia\n", + "2025-01-04 09:41:36,138 INFO Added attack: lira\n", + "2025-01-04 09:41:36,139 INFO Preparing attack: rmia\n", + "2025-01-04 09:41:36,139 INFO Preparing shadow models for RMIA attack\n", + "2025-01-04 09:41:36,139 INFO Preparing attack data for training the RMIA attack\n", + "2025-01-04 09:41:36,141 INFO Check for 8 shadow models (dataset: 23944 points)\n", + "2025-01-04 09:41:36,143 INFO Number of existing models exceeds or equals the number of models to create\n", + "2025-01-04 09:41:36,144 INFO Loading shadow model 0\n", + "2025-01-04 09:41:36,150 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_0.pkl\n", + "2025-01-04 09:41:36,150 INFO Loading shadow model 4\n", + "2025-01-04 09:41:36,154 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_4.pkl\n", + "2025-01-04 09:41:36,154 INFO Loading shadow model 2\n", + "2025-01-04 09:41:36,158 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_2.pkl\n", + "2025-01-04 09:41:36,158 INFO Loading shadow model 3\n", + "2025-01-04 09:41:36,162 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_3.pkl\n", + "2025-01-04 09:41:36,163 INFO Loading shadow model 5\n", + "2025-01-04 09:41:36,166 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_5.pkl\n", + "2025-01-04 09:41:36,167 INFO Loading shadow model 7\n", + "2025-01-04 09:41:36,171 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_7.pkl\n", + "2025-01-04 09:41:36,171 INFO Loading shadow model 1\n", + "2025-01-04 09:41:36,175 INFO Loaded model 
from ./leakpro_output/attack_objects/shadow_model/shadow_model_1.pkl\n", + "2025-01-04 09:41:36,175 INFO Loading shadow model 6\n", + "2025-01-04 09:41:36,179 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_6.pkl\n", + "2025-01-04 09:41:36,179 INFO Running attack: rmia\n", + "2025-01-04 09:41:36,180 INFO Running RMIA online attack\n", + "2025-01-04 09:41:36,180 INFO Loading metadata 0\n", + "2025-01-04 09:41:36,181 INFO Loading metadata 4\n", + "2025-01-04 09:41:36,181 INFO Loading metadata 2\n", + "2025-01-04 09:41:36,182 INFO Loading metadata 3\n", + "2025-01-04 09:41:36,183 INFO Loading metadata 5\n", + "2025-01-04 09:41:36,183 INFO Loading metadata 7\n", + "2025-01-04 09:41:36,184 INFO Loading metadata 1\n", + "2025-01-04 09:41:36,184 INFO Loading metadata 6\n", + "2025-01-04 09:41:36,952 INFO Number of points in the audit dataset that are used for online attack: 18985\n", + "2025-01-04 09:43:01,678 INFO Subsampling attack data from 4790 points \n", + "2025-01-04 09:43:01,679 INFO Number of attack data points after subsampling: 2395\n", + "2025-01-04 09:43:13,335 INFO Finished attack: rmia \n", + "2025-01-04 09:43:13,336 INFO Preparing attack: lira\n", + "2025-01-04 09:43:13,339 INFO Number of existing models exceeds or equals the number of models to create\n", + "2025-01-04 09:43:13,340 INFO Loading shadow model 0\n", + "2025-01-04 09:43:13,343 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_0.pkl\n", + "2025-01-04 09:43:13,344 INFO Loading shadow model 4\n", + "2025-01-04 09:43:13,348 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_4.pkl\n", + "2025-01-04 09:43:13,348 INFO Loading shadow model 2\n", + "2025-01-04 09:43:13,352 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_2.pkl\n", + "2025-01-04 09:43:13,352 INFO Loading shadow model 3\n", + "2025-01-04 09:43:13,356 INFO Loaded model from 
./leakpro_output/attack_objects/shadow_model/shadow_model_3.pkl\n", + "2025-01-04 09:43:13,356 INFO Loading shadow model 5\n", + "2025-01-04 09:43:13,360 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_5.pkl\n", + "2025-01-04 09:43:13,361 INFO Loading shadow model 7\n", + "2025-01-04 09:43:13,364 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_7.pkl\n", + "2025-01-04 09:43:13,365 INFO Loading shadow model 1\n", + "2025-01-04 09:43:13,369 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_1.pkl\n", + "2025-01-04 09:43:13,369 INFO Loading shadow model 6\n", + "2025-01-04 09:43:13,373 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_6.pkl\n", + "2025-01-04 09:43:13,374 INFO Create masks for all IN and OUT samples\n", + "2025-01-04 09:43:13,374 INFO Loading metadata 0\n", + "2025-01-04 09:43:13,375 INFO Loading metadata 4\n", + "2025-01-04 09:43:13,376 INFO Loading metadata 2\n", + "2025-01-04 09:43:13,376 INFO Loading metadata 3\n", + "2025-01-04 09:43:13,377 INFO Loading metadata 5\n", + "2025-01-04 09:43:13,377 INFO Loading metadata 7\n", + "2025-01-04 09:43:13,378 INFO Loading metadata 1\n", + "2025-01-04 09:43:13,379 INFO Loading metadata 6\n", + "2025-01-04 09:43:13,388 INFO Calculating the logits for all 8 shadow models\n", + "2025-01-04 09:44:30,637 INFO Calculating the logits for the target model \n", + "2025-01-04 09:44:40,363 INFO Running attack: lira \n", + "Processing audit samples: 100%|██████████| 18985/18985 [00:03<00:00, 5962.65it/s]\n", + "2025-01-04 09:44:43,585 INFO Finished attack: lira\n", + "2025-01-04 09:44:43,586 INFO Preparing results for attack: rmia\n", + "2025-01-04 09:44:43,586 INFO Preparing results for attack: lira\n", + "2025-01-04 09:44:43,586 INFO Auditing completed\n" + ] + } + ], + "source": [ + "from mimic_gru_handler import MimicInputHandlerGRU\n", + "\n", + "from leakpro import LeakPro\n", + "\n", + "# 
Read the config file\n", + "config_path = \"audit.yaml\"\n", + "\n", + "# Prepare leakpro object\n", + "leakpro = LeakPro(MimicInputHandlerGRU, config_path)\n", + "\n", + "# Run the audit\n", + "mia_results = leakpro.run_audit(return_results=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Report Generation" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-01-04 09:45:16,653 INFO Initializing report handler...\n", + "2025-01-04 09:45:16,654 INFO report_dir set to: ./leakpro_output/results\n", + "2025-01-04 09:45:16,654 INFO Saving results for rmia\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-01-04 09:45:20,466 INFO Saving results for lira\n" + ] + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Import and initialize ReportHandler\n", + "from leakpro.reporting.report_handler import ReportHandler\n", + "\n", + "# report_handler = ReportHandler()\n", + "report_handler = ReportHandler(report_dir=\"./leakpro_output/results\")\n", + "\n", + "# Save MIA resuls using report handler\n", + "for res in mia_results:\n", + " report_handler.save_results(attack_name=res.attack_name, result_data=res, config=res.configs)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-01-04 09:45:33,289 INFO No results of type GIAResults found.\n", + "2025-01-04 09:45:33,289 INFO No results of type SinglingOutResults found.\n", + "2025-01-04 09:45:33,290 INFO No results of type InferenceResults found.\n", + "2025-01-04 09:45:33,290 INFO No results of type LinkabilityResults found.\n", + "2025-01-04 09:45:40,303 INFO PDF compiled\n" + ] + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# # Create the report by compiling the latex text\n", + "report_handler.create_report()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "leakpro_test", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/mia/LOS/mimic_handler.py b/examples/mia/LOS/mimic_LR_handler.py similarity index 92% rename from examples/mia/LOS/mimic_handler.py rename to examples/mia/LOS/mimic_LR_handler.py index eafa12b4..9f8ade1d 100644 --- a/examples/mia/LOS/mimic_handler.py +++ b/examples/mia/LOS/mimic_LR_handler.py @@ -7,15 +7,16 @@ from leakpro import AbstractInputHandler + class MimicInputHandler(AbstractInputHandler): - """Class to handle the user input for the CIFAR10 dataset.""" + """Class to handle the user input for the MIMICIII dataset.""" def __init__(self, configs: dict) -> None: super().__init__(configs = configs) def get_criterion(self)->None: - """Set the CrossEntropyLoss for the model.""" + """Set the Binary Cross Entropy Loss for the model.""" return BCELoss() def get_optimizer(self, model:torch.nn.Module) -> None: @@ -40,11 +41,11 @@ def train( criterion = self.get_criterion() optimizer = self.get_optimizer(model) - + for e in tqdm(range(epochs), desc="Training Progress"): model.train() train_acc, train_loss = 0.0, 0.0 - + for data, target in dataloader: target = target.float().unsqueeze(1) data, target = data.to(compute_device, non_blocking=True), target.to(compute_device, non_blocking=True) @@ -54,11 +55,11 @@ def train( loss = 
criterion(output, target) pred = sigmoid(output) >= 0.5 train_acc += pred.eq(target).sum().item() - + loss.backward() optimizer.step() train_loss += loss.item() - + train_acc = train_acc/len(dataloader.dataset) train_loss = train_loss/len(dataloader) diff --git a/examples/mia/LOS/mimic_LR_main.ipynb b/examples/mia/LOS/mimic_LR_main.ipynb new file mode 100644 index 00000000..f0ca43cc --- /dev/null +++ b/examples/mia/LOS/mimic_LR_main.ipynb @@ -0,0 +1,318 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MIA attacks on Length-of-Stay predictor, Logistic Regression\n", + "## Installation of Packages in Conda\n", + "\n", + "To install the required packages in your conda environment, you can use the following commands:\n", + "\n", + "```bash\n", + "conda install h5py\n", + "conda install pytables\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "\n", + "project_root = os.path.abspath(os.path.join(os.getcwd(), \"../../..\"))\n", + "sys.path.append(project_root)\n", + "\n", + "from examples.mia.LOS.utils.data_processing import get_mimic_dataloaders, get_mimic_dataset\n", + "from examples.mia.LOS.utils.model_LR import LR, create_trained_model_and_metadata\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train the classifier\n", + "For the LR, the data should be flattened. So set `use_LR` to True for the LR model and False for the GRU-D" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Generate the dataset and dataloaders\n", + "path = os.path.join(os.getcwd(), \"data/\")\n", + "use_LR = True # If True, use a logistic regression model.
If False, use a GRUD model.\n", + "dataset, train_indices, validation_indices, test_indices, early_stop_indices = get_mimic_dataset(path, train_frac = 0.5,\n", + " test_frac = 0.2,\n", + " validation_frac = 0,\n", + " early_stop_frac = 0,\n", + " use_LR = use_LR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_loader, validation_loader, test_loader, early_stop_loader = get_mimic_dataloaders(dataset,\n", + " train_indices,\n", + " validation_indices,\n", + " test_indices,\n", + " early_stop_indices,\n", + " batch_size=128)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of features: 7488\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Progress: 100%|██████████| 20/20 [00:27<00:00, 1.37s/it]\n" + ] + } + ], + "source": [ + "n_features = dataset.x.shape[1]\n", + "print(f\"Number of features: {n_features}\")\n", + "\n", + "model = LR(n_features)\n", + "train_acc, train_loss, test_acc, test_loss = create_trained_model_and_metadata(model,\n", + " train_loader,\n", + " test_loader,\n", + " lr = 0.0001,\n", + " weight_decay = 5.392,\n", + " epochs=20)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Plot training and test accuracy\n", + "plt.figure(figsize=(5, 4))\n", + "\n", + "plt.subplot(1, 2, 1)\n", + "plt.plot(train_acc, label=\"Train Accuracy\")\n", + "plt.plot(test_acc, label=\"Test Accuracy\")\n", + "plt.xlabel(\"Epoch\")\n", + "plt.ylabel(\"Accuracy\")\n", + "plt.title(\"Accuracy over Epochs\")\n", + "plt.legend()\n", + "\n", + "# Plot training and test loss\n", + "plt.subplot(1, 2, 2)\n", + "plt.plot(train_loss, label=\"Train Loss\")\n", + "plt.plot(test_loss, label=\"Test Loss\")\n", + "plt.xlabel(\"Epoch\")\n", + "plt.ylabel(\"Loss\")\n", + "plt.title(\"Loss over Epochs\")\n", + "plt.legend()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Attack the LR model\n", + "Modify ```audit.yaml ``` file to attack LR model: \n", + " \n", + " ```\n", + " module_path: \"utils/model_LR.py\" \n", + " model_class: \"LR\"\n", + " target_folder: \"./target_LR\"\n", + " data_path: \"./data/flattened/dataset.pkl\"\n", + " ```\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-01-10 09:53:42,304 INFO Target model blueprint created from LR in utils/model_LR.py.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-01-10 09:53:42,358 INFO Loaded target model metadata from ./target_LR/model_metadata.pkl\n", + "2025-01-10 09:53:42,360 INFO Loaded target model from ./target_LR\n", + "2025-01-10 09:53:43,098 INFO Loaded population dataset from ./data/flattened/dataset.pkl\n", + "2025-01-10 09:53:43,100 INFO Loaded population dataset from ./data/flattened/dataset.pkl\n", + "2025-01-10 09:53:43,101 INFO Configuring RMIA attack\n", + "2025-01-10 09:53:43,101 INFO Added attack: rmia\n", + "2025-01-10 09:53:43,102 INFO 
Added attack: lira\n", + "2025-01-10 09:53:43,103 INFO Preparing attack: rmia\n", + "2025-01-10 09:53:43,103 INFO Preparing shadow models for RMIA attack\n", + "2025-01-10 09:53:43,104 INFO Preparing attack data for training the RMIA attack\n", + "2025-01-10 09:53:43,105 INFO Check for 8 shadow models (dataset: 23944 points)\n", + "2025-01-10 09:53:43,106 WARNING Using the same model class for shadow models as the target model.\n", + "2025-01-10 09:53:43,121 WARNING Mismatched model types found in saved shadow models: [(0, 'Unknown'), (4, 'Unknown'), (2, 'Unknown'), (15, 'LogisticRegression'), (14, 'LogisticRegression'), (11, 'LogisticRegression'), (3, 'Unknown'), (9, 'LogisticRegression'), (5, 'Unknown'), (8, 'LogisticRegression'), (7, 'Unknown'), (16, 'LogisticRegression'), (13, 'LogisticRegression'), (1, 'Unknown'), (12, 'LogisticRegression'), (10, 'LogisticRegression'), (17, 'LogisticRegression'), (6, 'Unknown')]. Expected model type: LR.\n", + "2025-01-10 09:53:43,122 INFO Number of existing models exceeds or equals the number of models to create\n", + "2025-01-10 09:53:43,123 INFO Loading shadow model 23\n", + "2025-01-10 09:53:43,125 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_23.pkl\n", + "2025-01-10 09:53:43,126 INFO Loading shadow model 20\n", + "2025-01-10 09:53:43,128 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_20.pkl\n", + "2025-01-10 09:53:43,129 INFO Loading shadow model 21\n", + "2025-01-10 09:53:43,131 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_21.pkl\n", + "2025-01-10 09:53:43,131 INFO Loading shadow model 25\n", + "2025-01-10 09:53:43,133 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_25.pkl\n", + "2025-01-10 09:53:43,134 INFO Loading shadow model 18\n", + "2025-01-10 09:53:43,135 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_18.pkl\n", + "2025-01-10 09:53:43,136 INFO 
Loading shadow model 24\n", + "2025-01-10 09:53:43,138 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_24.pkl\n", + "2025-01-10 09:53:43,138 INFO Loading shadow model 22\n", + "2025-01-10 09:53:43,140 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_22.pkl\n", + "2025-01-10 09:53:43,141 INFO Loading shadow model 19\n", + "2025-01-10 09:53:43,143 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_19.pkl\n", + "2025-01-10 09:53:43,143 INFO Running attack: rmia\n", + "2025-01-10 09:53:43,144 INFO Running RMIA online attack\n", + "2025-01-10 09:53:43,144 INFO Loading metadata 23\n", + "2025-01-10 09:53:43,146 INFO Loading metadata 20\n", + "2025-01-10 09:53:43,147 INFO Loading metadata 21\n", + "2025-01-10 09:53:43,148 INFO Loading metadata 25\n", + "2025-01-10 09:53:43,149 INFO Loading metadata 18\n", + "2025-01-10 09:53:43,150 INFO Loading metadata 24\n", + "2025-01-10 09:53:43,151 INFO Loading metadata 22\n", + "2025-01-10 09:53:43,152 INFO Loading metadata 19\n", + "2025-01-10 09:53:43,705 INFO Number of points in the audit dataset that are used for online attack: 16632\n", + "2025-01-10 09:53:49,467 INFO Subsampling attack data from 7184 points \n", + "2025-01-10 09:53:49,468 INFO Number of attack data points after subsampling: 3592\n", + "2025-01-10 09:53:51,950 INFO Finished attack: rmia \n", + "2025-01-10 09:53:51,951 INFO Preparing attack: lira\n", + "2025-01-10 09:53:51,953 WARNING Using the same model class for shadow models as the target model.\n", + "2025-01-10 09:53:51,968 WARNING Mismatched model types found in saved shadow models: [(0, 'Unknown'), (4, 'Unknown'), (2, 'Unknown'), (15, 'LogisticRegression'), (14, 'LogisticRegression'), (11, 'LogisticRegression'), (3, 'Unknown'), (9, 'LogisticRegression'), (5, 'Unknown'), (8, 'LogisticRegression'), (7, 'Unknown'), (16, 'LogisticRegression'), (13, 'LogisticRegression'), (1, 'Unknown'), (12, 'LogisticRegression'), 
(10, 'LogisticRegression'), (17, 'LogisticRegression'), (6, 'Unknown')]. Expected model type: LR.\n", + "2025-01-10 09:53:51,969 INFO Number of existing models exceeds or equals the number of models to create\n", + "2025-01-10 09:53:51,969 INFO Loading shadow model 23\n", + "2025-01-10 09:53:51,971 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_23.pkl\n", + "2025-01-10 09:53:51,972 INFO Loading shadow model 20\n", + "2025-01-10 09:53:51,974 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_20.pkl\n", + "2025-01-10 09:53:51,974 INFO Loading shadow model 21\n", + "2025-01-10 09:53:51,976 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_21.pkl\n", + "2025-01-10 09:53:51,977 INFO Loading shadow model 25\n", + "2025-01-10 09:53:51,978 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_25.pkl\n", + "2025-01-10 09:53:51,979 INFO Loading shadow model 18\n", + "2025-01-10 09:53:51,980 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_18.pkl\n", + "2025-01-10 09:53:51,981 INFO Loading shadow model 24\n", + "2025-01-10 09:53:51,983 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_24.pkl\n", + "2025-01-10 09:53:51,984 INFO Loading shadow model 22\n", + "2025-01-10 09:53:51,985 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_22.pkl\n", + "2025-01-10 09:53:51,986 INFO Loading shadow model 19\n", + "2025-01-10 09:53:51,987 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_19.pkl\n", + "2025-01-10 09:53:51,988 INFO Create masks for all IN and OUT samples\n", + "2025-01-10 09:53:51,989 INFO Loading metadata 23\n", + "2025-01-10 09:53:51,990 INFO Loading metadata 20\n", + "2025-01-10 09:53:51,991 INFO Loading metadata 21\n", + "2025-01-10 09:53:51,992 INFO Loading metadata 25\n", + "2025-01-10 09:53:51,993 INFO Loading metadata 
18\n", + "2025-01-10 09:53:51,994 INFO Loading metadata 24\n", + "2025-01-10 09:53:51,995 INFO Loading metadata 22\n", + "2025-01-10 09:53:51,996 INFO Loading metadata 19\n", + "2025-01-10 09:53:52,003 INFO Calculating the logits for all 8 shadow models\n", + "2025-01-10 09:53:58,105 INFO Calculating the logits for the target model \n", + "2025-01-10 09:53:58,985 INFO Running attack: lira \n", + "Processing audit samples: 100%|██████████| 16632/16632 [00:05<00:00, 3226.14it/s]\n", + "2025-01-10 09:54:04,171 INFO Finished attack: lira\n", + "2025-01-10 09:54:04,172 INFO Preparing results for attack: rmia\n", + "2025-01-10 09:54:10,179 INFO Preparing results for attack: lira\n", + "2025-01-10 09:54:15,544 INFO Auditing completed\n" + ] + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from mimic_LR_handler import MimicInputHandler\n", + "\n", + "from leakpro import LeakPro\n", + "\n", + "# Read the config file\n", + "config_path = \"audit.yaml\"\n", + "\n", + "# Prepare leakpro object\n", + "leakpro = LeakPro(MimicInputHandler, config_path)\n", + "\n", + "# Run the audit\n", + "leakpro.run_audit()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "leakpro_test", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/mia/LOS/mimic_gru_handler.py b/examples/mia/LOS/mimic_gru_handler.py new file mode 100644 index 00000000..be51e6b8 --- /dev/null +++ b/examples/mia/LOS/mimic_gru_handler.py @@ -0,0 +1,82 @@ + +from torch import cuda, device, nn, optim, squeeze +from torch.nn import CrossEntropyLoss +from torch.utils.data import DataLoader +from tqdm import tqdm +from sklearn.metrics import accuracy_score +from leakpro import AbstractInputHandler + + +class MimicInputHandlerGRU(AbstractInputHandler): + """Class to handle the user input for the MIMICIII dataset.""" + + def __init__(self, configs: dict) -> None: + super().__init__(configs = configs) + + def get_criterion(self)->CrossEntropyLoss: + """Set the CrossEntropyLoss for the model.""" + return CrossEntropyLoss() + + def get_optimizer(self, model:nn.Module) -> optim.Optimizer: + """Set the optimizer for the model.""" + learning_rate = 0.01 + return optim.Adam(model.parameters(), lr=learning_rate) + + def convert_to_device(self, x): + device_name = device("cuda" if cuda.is_available() else "cpu") + return x.to(device_name) + + def to_numpy(self, tensor) : + return 
tensor.detach().cpu().numpy() if tensor.is_cuda else tensor.detach().numpy() + + def train( + self, + dataloader: DataLoader, + model: nn.Module = None, + criterion: nn.Module = None, + optimizer: optim.Optimizer = None, + epochs: int = None, + ) -> dict: + + """Model training procedure.""" + device_name = device("cuda" if cuda.is_available() else "cpu") + model.to(device_name) + model.train() + + criterion = self.get_criterion() + optimizer = self.get_optimizer(model) + + for e in tqdm(range(epochs), desc="Training Progress"): + model.train() + train_acc, train_loss = 0.0, 0.0 + + for _, (x, labels) in enumerate(tqdm(dataloader, desc="Training Batches")): + x = self.convert_to_device(x) + labels = self.convert_to_device(labels) + labels = labels.long() + + optimizer.zero_grad() + output = model(x) + + loss = criterion(squeeze(output), squeeze(labels).long()) + loss.backward() + optimizer.step() + train_loss += loss.item() + + train_loss = train_loss/len(dataloader) + binary_predictions = self.to_numpy(output).argmax(axis=1) + + # Ensure labels are integer and 1D + binary_labels = self.to_numpy(labels).astype(int) + # Compute accuracy + train_acc = accuracy_score(binary_labels, binary_predictions) + + return {"model": model, "metrics": {"accuracy": train_acc, "loss": train_loss}} + + + + + + + + diff --git a/examples/mia/LOS/mimic_main.ipynb b/examples/mia/LOS/mimic_main.ipynb deleted file mode 100644 index 47578870..00000000 --- a/examples/mia/LOS/mimic_main.ipynb +++ /dev/null @@ -1,278 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Installation of Packages in Conda\n", - "\n", - "After prepraring the data according to ```mimiciii_prepration/ReadMe.md```, to install the required packages in your conda environment, you can use the following commands:\n", - "\n", - "```bash\n", - "conda install h5py\n", - "conda install pytables\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
"outputs": [], - "source": [ - "%reload_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import sys\n", - "\n", - "project_root = os.path.abspath(os.path.join(os.getcwd(), \"../../..\"))\n", - "sys.path.append(project_root)\n", - "\n", - "from examples.mia.LOS.utils.data_processing import get_mimic_dataset, get_mimic_dataloaders\n", - "from examples.mia.LOS.utils.model import MimicLR, create_trained_model_and_metadata\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Generate the dataset and dataloaders\n", - "path = os.path.join(os.getcwd(), \"data/\")\n", - "\n", - "dataset, train_indices, test_indices= get_mimic_dataset(path, train_frac = 0.5, test_frac= 0.2)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "train_loader, test_loader= get_mimic_dataloaders(dataset, train_indices, test_indices, batch_size=128)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of features: 7488\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Training Progress: 100%|██████████| 50/50 [01:06<00:00, 1.33s/it]\n" - ] - } - ], - "source": [ - "n_features = dataset.x.shape[1]\n", - "print(f\"Number of features: {n_features}\")\n", - "\n", - "# Train the model\n", - "if not os.path.exists(\"target\"):\n", - " os.makedirs(\"target\")\n", - "model = MimicLR(n_features)\n", - "train_acc, train_loss, test_acc, test_loss = create_trained_model_and_metadata(model, \n", - " train_loader, \n", - " test_loader, \n", - " lr = 0.0001,\n", - " weight_decay = 0.5392, # chosing 5.392 decrease the train test gap \n", - " epochs=50)\n", - "\n", - "\n" - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "# Plot training and test accuracy\n", - "plt.figure(figsize=(5, 4))\n", - "\n", - "plt.subplot(1, 2, 1)\n", - "plt.plot(train_acc, label='Train Accuracy')\n", - "plt.plot(test_acc, label='Test Accuracy')\n", - "plt.xlabel('Epoch')\n", - "plt.ylabel('Accuracy')\n", - "plt.title('Accuracy over Epochs')\n", - "plt.legend()\n", - "\n", - "# Plot training and test loss\n", - "plt.subplot(1, 2, 2)\n", - "plt.plot(train_loss, label='Train Loss')\n", - "plt.plot(test_loss, label='Test Loss')\n", - "plt.xlabel('Epoch')\n", - "plt.ylabel('Loss')\n", - "plt.title('Loss over Epochs')\n", - "plt.legend()\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-11-22 09:25:45,391 INFO Target model blueprint created from MimicLR in utils/model.py.\n", - "2024-11-22 09:25:45,398 INFO Loaded target model metadata from ./target/model_metadata.pkl\n", - "2024-11-22 09:25:45,400 INFO Loaded target model from ./target\n", - "2024-11-22 09:25:46,568 INFO Loaded population dataset from ./data/dataset.pkl\n", - "2024-11-22 09:25:46,569 INFO Loaded population dataset from ./data/dataset.pkl\n", - "2024-11-22 09:25:46,570 INFO Added attack: lira\n", - "2024-11-22 09:25:46,571 INFO Preparing attack: lira\n", - "2024-11-22 09:25:46,574 INFO Number of existing models exceeds or equals the number of models to create\n", - "2024-11-22 09:25:46,575 INFO Loading shadow model 0\n", - "2024-11-22 09:25:46,578 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_0.pkl\n", - "2024-11-22 09:25:46,579 INFO Loading shadow model 4\n", - "2024-11-22 09:25:46,580 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_4.pkl\n", - "2024-11-22 09:25:46,581 INFO Loading 
shadow model 2\n", - "2024-11-22 09:25:46,582 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_2.pkl\n", - "2024-11-22 09:25:46,582 INFO Loading shadow model 3\n", - "2024-11-22 09:25:46,584 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_3.pkl\n", - "2024-11-22 09:25:46,584 INFO Loading shadow model 5\n", - "2024-11-22 09:25:46,585 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_5.pkl\n", - "2024-11-22 09:25:46,586 INFO Loading shadow model 7\n", - "2024-11-22 09:25:46,587 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_7.pkl\n", - "2024-11-22 09:25:46,587 INFO Loading shadow model 1\n", - "2024-11-22 09:25:46,589 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_1.pkl\n", - "2024-11-22 09:25:46,589 INFO Loading shadow model 6\n", - "2024-11-22 09:25:46,594 INFO Loaded model from ./leakpro_output/attack_objects/shadow_model/shadow_model_6.pkl\n", - "2024-11-22 09:25:46,594 INFO Create masks for all IN and OUT samples\n", - "2024-11-22 09:25:46,595 INFO Loading metadata 0\n", - "2024-11-22 09:25:46,596 INFO Loading metadata 4\n", - "2024-11-22 09:25:46,596 INFO Loading metadata 2\n", - "2024-11-22 09:25:46,597 INFO Loading metadata 3\n", - "2024-11-22 09:25:46,598 INFO Loading metadata 5\n", - "2024-11-22 09:25:46,598 INFO Loading metadata 7\n", - "2024-11-22 09:25:46,599 INFO Loading metadata 1\n", - "2024-11-22 09:25:46,599 INFO Loading metadata 6\n", - "2024-11-22 09:25:46,610 INFO Calculating the logits for all 8 shadow models\n", - "2024-11-22 09:25:52,584 INFO Calculating the logits for the target model \n", - "2024-11-22 09:25:53,550 INFO Running attack: lira \n", - "Processing audit samples: 100%|██████████| 16634/16634 [00:02<00:00, 6073.71it/s]\n", - "2024-11-22 09:25:56,390 INFO Finished attack: lira\n", - "2024-11-22 09:25:56,390 INFO Preparing results for attack: lira\n" - ] - }, - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "No existing file named './leakpro_output/results/lira/results.txt'. A new file will be created.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-11-22 09:25:59,687 INFO Auditing completed\n" - ] - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from mimic_handler import MimicInputHandler\n", - "\n", - "from leakpro import LeakPro\n", - "\n", - "# Read the config file\n", - "config_path = \"audit.yaml\"\n", - "\n", - "# Prepare leakpro object\n", - "leakpro = LeakPro(MimicInputHandler, config_path)\n", - "\n", - "# Run the audit \n", - "leakpro.run_audit()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "leakpro_test", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/mia/LOS/mimiciii_prepration/MIMIC_Extract/run.sh b/examples/mia/LOS/mimiciii_prepration/MIMIC_Extract/run.sh index 277c1c25..5fcf13ac 100644 --- a/examples/mia/LOS/mimiciii_prepration/MIMIC_Extract/run.sh +++ b/examples/mia/LOS/mimiciii_prepration/MIMIC_Extract/run.sh @@ -48,4 +48,4 @@ python3 mimic_direct_extract.py --resource_path resources/ --out_path output/ -- # # Part 3: Copy the output to the data directory # echo 'Copying file to target directory' -cp ./output/all_hourly_data.h5 ../../data/ \ No newline at end of file +cp ./output/all_hourly_data.h5 ../../data/ diff --git a/examples/mia/LOS/utils/data_processing.py b/examples/mia/LOS/utils/data_processing.py index b4a25098..b3b2a26c 100644 --- a/examples/mia/LOS/utils/data_processing.py +++ b/examples/mia/LOS/utils/data_processing.py @@ -1,26 +1,19 @@ """ +This file is inspired by https://github.com/MLforHealth/MIMIC_Extract MIT License - Copyright 
(c) 2019 MIT Laboratory for Computational Physiology - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. """ #TODO: Do I need to include the license for this file.? import os +import pickle + import numpy as np import pandas as pd -import pickle -from torch import from_numpy, Tensor -from torch.utils.data import Dataset, Subset, DataLoader from sklearn.preprocessing import StandardScaler +from torch import Tensor, from_numpy +from torch.utils.data import DataLoader, Dataset, Subset +from utils.model_grud import to_3D_tensor +from tqdm import tqdm class MimicDataset(Dataset): @@ -30,7 +23,7 @@ def __init__(self, x, y): self.x = from_numpy(x).float() # Convert features to torch tensors if needed else: self.x = x.float() # Ensure it is of type float - + # Check if y is already a tensor if not isinstance(y, Tensor): self.y = from_numpy(y).float() # Convert labels to torch tensors if needed @@ -42,104 +35,148 @@ def __len__(self): def __getitem__(self, idx): return self.x[idx], self.y[idx].squeeze(0) - + def subset(self, indices): return MimicDataset(self.x[indices], self.y[indices]) - -def get_mimic_dataset(path, train_frac, test_frac): + +def get_mimic_dataset(data_path, + train_frac, + validation_frac, + test_frac, + early_stop_frac, + use_LR = True): """Get the dataset, download it if necessary, and store it.""" - + + # Assert that the sum of all fractions is between 0 and 1 + total_frac = train_frac + validation_frac + test_frac + early_stop_frac + assert 
0 < total_frac <= 1, "The sum of dataset fractions must be between 0 and 1." + + if use_LR: + path = data_path + "flattened/" + else: + path = data_path + "unflattened/" dataset_path = os.path.join(path, "dataset.pkl") indices_path = os.path.join(path, "indices.pkl") - + if os.path.exists(dataset_path) and os.path.exists(indices_path): + print("Loading dataset...") with open(dataset_path, "rb") as f: dataset = pickle.load(f) # Load the dataset with open(indices_path, "rb") as f: indices_dict = pickle.load(f) # Load the dictionary containing indices - train_indices = indices_dict['train_indices'] # Get the actual train indices - test_indices = indices_dict['test_indices'] # Get the actual test indices - return dataset, train_indices, test_indices - - else: - data_file_path = os.path.join(path, "all_hourly_data.h5") - if os.path.exists(data_file_path): - data= pd.read_hdf(data_file_path, 'vitals_labs') - statics = pd.read_hdf(data_file_path, 'patients') - - ID_COLS = ['subject_id', 'hadm_id', 'icustay_id'] - - train_data, holdout_data, y_train, y_holdout_data = data_splitter(statics, - data, - train_frac) - - train_data , holdout_data = data_normalization(train_data, holdout_data) - - train_data, holdout_data = [simple_imputer(df, ID_COLS) for df in (train_data, holdout_data)] - flat_train, flat_holdout = [df.pivot_table(index=['subject_id', 'hadm_id', 'icustay_id'], - columns=['hours_in']) for df in (train_data, holdout_data) ] - - # Reset the index to flatten the multi-index structure - flat_train, flat_holdout, Ys_train, Ys_test = [flatten_multi_index(df) - for df in (flat_train, flat_holdout, y_train, y_holdout_data)] - - # Check for missing values in all relevant DataFrames - assert_no_missing_values(train_data, holdout_data, flat_train, flat_holdout) - - train_df, test_df = standard_scaler(flat_train, flat_holdout) + train_indices = indices_dict["train_indices"] # Get the actual train indices + validation_indices = indices_dict["validation_indices"] # Get the 
actual validation indices + test_indices = indices_dict["test_indices"] # Get the actual test indices + early_stop_indices = indices_dict["early_stop_indices"] # Get the actual early stop indices + print(f"Loaded dataset from {dataset_path}") + return dataset, train_indices, validation_indices ,test_indices, early_stop_indices + + data_file_path = os.path.join(data_path, "all_hourly_data.h5") + if os.path.exists(data_file_path): + print("Loading data...") + data = pd.read_hdf(data_file_path, "vitals_labs") + statics = pd.read_hdf(data_file_path, "patients") + + ID_COLS = ["subject_id", "hadm_id", "icustay_id"] + + print("Splitting data...") + train_data, holdout_data, y_train, y_holdout_data = data_splitter(statics, + data, + train_frac) - # Creating the dataset - data_x = pd.concat((train_df, test_df), axis=0) - data_y = pd.concat((Ys_train, Ys_test), axis=0) + print("Normalizing data...") + train_data , holdout_data = data_normalization(train_data, holdout_data) + + print("Imputing missing values...") + train_data, holdout_data = [ + simple_imputer(df, ID_COLS) for df in tqdm((train_data, holdout_data), desc="Imputation")] + + if use_LR: + # Apply pivot_table to flatten the data + print("Flattening data for LR...") + flat_train, flat_holdout = [ + df.pivot_table(index=ID_COLS, columns=["hours_in"]) + for df in tqdm((train_data, holdout_data), desc="Flattening") + ] + print("Flattening data...") + train, holdout, label_train, label_holdout = [ + flatten_multi_index(df) + for df in tqdm((flat_train, flat_holdout, y_train, y_holdout_data), desc="Flattening Index") + ] + else: + # Skip pivot_table if flatten is False + train, holdout, label_train, label_holdout = train_data, holdout_data, y_train, y_holdout_data + + assert_no_missing_values(train_data, holdout_data, train, holdout) + + train_df, holdout_df = standard_scaler(train, holdout) + + # Creating the dataset + data_x = pd.concat((train_df, holdout_df), axis=0) + data_y = pd.concat((label_train, 
label_holdout), axis=0) - assert np.issubdtype(data_x.values.dtype, np.number), "Non-numeric data found in features." - assert np.issubdtype(data_y.values.dtype, np.number), "Non-numeric data found in labels." + assert np.issubdtype(data_x.values.dtype, np.number), "Non-numeric data found in features." + assert np.issubdtype(data_y.values.dtype, np.number), "Non-numeric data found in labels." + print("Creating dataset...") + if use_LR: dataset = MimicDataset(data_x.values, data_y.values) - - # Generate indices for training and testing - train_indices, test_indices = data_indices(data_x, train_frac, test_frac) - - # Save the dataset to dataset.pkl - with open(dataset_path, "wb") as file: - pickle.dump(dataset, file) - print(f"Saved dataset to {dataset_path}") - - # Save train and test indices to indices.pkl - indices_to_save = { - "train_indices": train_indices, - "test_indices": test_indices - } - with open(indices_path, "wb") as file: - pickle.dump(indices_to_save, file) - print(f"Saved train and test indices to {indices_path}") - - else: - msg = "Please download the MIMIC-III dataset from https://physionet.org/content/mimiciii/1.4/ and save it in the specified path." 
- raise FileNotFoundError(msg) - return dataset, train_indices, test_indices - - -def data_splitter(statics, data, train_frac): + else: + data_x = to_3D_tensor(data_x) + dataset = MimicDataset(data_x, data_y.values) + + # Generate indices for training, validation, test, and early stopping + train_indices, validation_indices, test_indices, early_stop_indices = data_indices(data_x, + train_frac, + validation_frac, + test_frac, + early_stop_frac) + + os.makedirs(os.path.dirname(dataset_path), exist_ok=True) + # Save the dataset to dataset.pkl + print("Saving dataset and indices...") + with open(dataset_path, "wb") as file: + pickle.dump(dataset, file) + print(f"Saved dataset to {dataset_path}") + + # Save train and test indices to indices.pkl + indices_to_save = { + "train_indices": train_indices, + "validation_indices": validation_indices, + "test_indices": test_indices, + "early_stop_indices": early_stop_indices, + } + with open(indices_path, "wb") as file: + pickle.dump(indices_to_save, file) + print(f"Saved train and test indices to {indices_path}") + else: + msg = "Please download the MIMIC-III dataset from https://physionet.org/content/mimiciii/1.4/ and save it in the specified path." 
+ raise FileNotFoundError(msg) + return dataset, train_indices, validation_indices, test_indices, early_stop_indices + + +def data_splitter(statics, + data, + train_frac): GAP_TIME = 6 # In hours WINDOW_SIZE = 24 # In hours SEED = 1 - Ys = statics[statics.max_hours > WINDOW_SIZE + GAP_TIME][['los_icu']] - Ys['los_3'] = Ys['los_icu'] > 3 - Ys.drop(columns=['los_icu'], inplace=True) - Ys['los_3'] = Ys['los_3'].astype(float) + Ys = statics[statics.max_hours > WINDOW_SIZE + GAP_TIME][["los_icu"]] + Ys["los_3"] = Ys["los_icu"] > 3 + Ys.drop(columns=["los_icu"], inplace=True) + Ys["los_3"] = Ys["los_3"].astype(float) lvl2 = data[ - (data.index.get_level_values('icustay_id').isin(set(Ys.index.get_level_values('icustay_id')))) & - (data.index.get_level_values('hours_in') < WINDOW_SIZE) + (data.index.get_level_values("icustay_id").isin(set(Ys.index.get_level_values("icustay_id")))) & + (data.index.get_level_values("hours_in") < WINDOW_SIZE) ] - - data_subj_idx, y_subj_idx = [df.index.get_level_values('subject_id') for df in (lvl2, Ys)] + + data_subj_idx, y_subj_idx = [df.index.get_level_values("subject_id") for df in (lvl2, Ys)] data_subjects = set(data_subj_idx) assert data_subjects == set(y_subj_idx), "Subject ID pools differ!" 
- + # Randomly shuffle subjects and compute the sizes of the splits np.random.seed(SEED) subjects = np.random.permutation(list(data_subjects)) @@ -152,75 +189,154 @@ def data_splitter(statics, data, train_frac): # Split the data according to the subjects (train_data, holdout_data), (y_train, y_holdout) = [ - [df[df.index.get_level_values('subject_id').isin(s)] for s in (train_subj, test_subj)] + [df[df.index.get_level_values("subject_id").isin(s)] for s in (train_subj, test_subj)] for df in (lvl2, Ys) ] return train_data, holdout_data, y_train, y_holdout -def simple_imputer(df, ID_COLS): +# def simple_imputer(dataframe, +# ID_COLS): +# idx = pd.IndexSlice +# df = dataframe.copy() +# if len(df.columns.names) > 2: df.columns = df.columns.droplevel(("label", "LEVEL1", "LEVEL2")) + +# df_out = df.loc[:, idx[:, ["mean", "count"]]] +# icustay_means = df_out.loc[:, idx[:, "mean"]].groupby(ID_COLS).mean() + +# df_out.loc[:, idx[:, "mean"]] = ( +# df_out.loc[:, idx[:, "mean"]] +# .groupby(ID_COLS) +# .ffill() # Replace forward fill method +# .groupby(ID_COLS) +# .fillna(icustay_means) # Fill remaining NaNs with icustay_means +# .fillna(0) # Fill any remaining NaNs with 0 +# ) + +# # df_out.loc[:,idx[:,"mean"]] = df_out.loc[:,idx[:,"mean"]].groupby(ID_COLS).fillna( +# # method="ffill" +# # ).groupby(ID_COLS).fillna(icustay_means).fillna(0) + +# df_out.loc[:, idx[:, "count"]] = (df.loc[:, idx[:, "count"]] > 0).astype(float) +# df_out.rename(columns={"count": "mask"}, level="Aggregation Function", inplace=True) + +# is_absent = (1 - df_out.loc[:, idx[:, "mask"]]) +# hours_of_absence = is_absent.cumsum() +# time_since_measured = hours_of_absence - hours_of_absence[is_absent==0].fillna(method="ffill") +# time_since_measured.rename(columns={"mask": "time_since_measured"}, level="Aggregation Function", inplace=True) + +# df_out = pd.concat((df_out, time_since_measured), axis=1) +# df_out.loc[:, idx[:, "time_since_measured"]] = df_out.loc[:, idx[:, 
def simple_imputer(dataframe, ID_COLS):
    """Impute the "mean" columns and derive mask / time-since-measured columns.

    Works on a copy of ``dataframe``; the input is not modified.

    Args:
        dataframe: Frame whose columns are a MultiIndex containing "mean" and
            "count" entries on the level named "Aggregation Function". If the
            columns have more than two levels, the levels "label", "LEVEL1"
            and "LEVEL2" are dropped first.
        ID_COLS: Grouping key(s) passed to ``groupby`` to identify one stay.

    Returns:
        A frame with, per variable, the columns "mask", "mean" and
        "time_since_measured", with columns sorted.

    """
    idx = pd.IndexSlice
    df = dataframe.copy()

    # Adjust column levels if necessary
    if len(df.columns.names) > 2:
        df.columns = df.columns.droplevel(("label", "LEVEL1", "LEVEL2"))

    # Explicit copy so the assignments below never write into a view of df
    df_out = df.loc[:, idx[:, ["mean", "count"]]].copy()

    # Per-group mean broadcast back to the original row index
    icustay_means = df_out.loc[:, idx[:, "mean"]].groupby(ID_COLS).transform("mean")

    # Fill order: forward fill within the group, then the group mean, then 0
    df_out.loc[:, idx[:, "mean"]] = (
        df_out.loc[:, idx[:, "mean"]]
        .groupby(ID_COLS)
        .ffill()
    )
    df_out.loc[:, idx[:, "mean"]] = df_out.loc[:, idx[:, "mean"]].fillna(icustay_means)
    df_out.loc[:, idx[:, "mean"]] = df_out.loc[:, idx[:, "mean"]].fillna(0)

    # Binary mask: 1.0 where at least one measurement was recorded
    df_out.loc[:, idx[:, "count"]] = (df.loc[:, idx[:, "count"]] > 0).astype(float)
    df_out = df_out.rename(columns={"count": "mask"}, level="Aggregation Function")

    # Time since last measurement.
    # NOTE(review): cumsum/ffill run over the whole frame, not per ID_COLS group,
    # so counts carry over from one stay to the next - confirm this is intended.
    is_absent = (1 - df_out.loc[:, idx[:, "mask"]])
    hours_of_absence = is_absent.cumsum()
    time_since_measured = hours_of_absence - hours_of_absence[is_absent == 0].ffill()
    time_since_measured.rename(columns={"mask": "time_since_measured"}, level="Aggregation Function", inplace=True)

    df_out = pd.concat((df_out, time_since_measured), axis=1)
    # Rows before the first observation have no reference point: use 100
    df_out.loc[:, idx[:, "time_since_measured"]] = df_out.loc[:, idx[:, "time_since_measured"]].fillna(100)

    df_out.sort_index(axis=1, inplace=True)

    return df_out


def data_indices(dataset,
                 train_frac,
                 valid_frac,
                 test_frac,
                 early_stop_frac):
    """Split ``range(len(dataset))`` into four consecutive index lists.

    Each split holds ``int(frac * len(dataset))`` indices and the splits are
    laid out back to back in the order train, validation, test, early stop.

    Args:
        dataset: Any object supporting ``len``.
        train_frac: Fraction of the data used for training.
        valid_frac: Fraction of the data used for validation.
        test_frac: Fraction of the data used for testing.
        early_stop_frac: Fraction of the data used for early stopping.

    Returns:
        Tuple ``(train_indices, validation_indices, test_indices, early_stop_indices)``
        of lists of ints.

    Raises:
        ValueError: If a fraction is negative or the splits together need more
            samples than the dataset contains.

    """
    N = len(dataset)
    fractions = {
        "train_frac": train_frac,
        "valid_frac": valid_frac,
        "test_frac": test_frac,
        "early_stop_frac": early_stop_frac,
    }
    for name, frac in fractions.items():
        if frac < 0:
            raise ValueError(f"{name} must be non-negative, got {frac}")

    sizes = [int(frac * N) for frac in fractions.values()]
    # Without this check oversubscribed fractions yield indices >= N, which
    # only fail much later when a Subset is indexed.
    if sum(sizes) > N:
        raise ValueError(
            f"Requested splits need {sum(sizes)} samples but the dataset only has {N}"
        )

    # Generate sequential, non-overlapping index ranges
    splits = []
    start = 0
    for size in sizes:
        splits.append(list(range(start, start + size)))
        start += size
    train_indices, validation_indices, test_indices, early_stop_indices = splits
    return train_indices, validation_indices, test_indices, early_stop_indices


def get_mimic_dataloaders(dataset,
                          train_indices,
                          validation_indices,
                          test_indices,
                          early_stop_indices,
                          batch_size=128):
    """Wrap four index lists of ``dataset`` in non-shuffling DataLoaders.

    Returns:
        Tuple ``(train_loader, validation_loader, test_loader, early_stop_loader)``.

    """
    train_subset = Subset(dataset, train_indices)
    test_subset = Subset(dataset, test_indices)
    validation_subset = Subset(dataset, validation_indices)
    early_stop_subset = Subset(dataset, early_stop_indices)

    # shuffle=False everywhere: batches follow the order of the index lists
    train_loader = DataLoader(train_subset, batch_size, shuffle=False)
    test_loader = DataLoader(test_subset, batch_size, shuffle=False)
    validation_loader = DataLoader(validation_subset, batch_size, shuffle=False)
    early_stop_loader = DataLoader(early_stop_subset, batch_size, shuffle=False)

    return train_loader, validation_loader, test_loader, early_stop_loader


def data_normalization(lvl2_train,
                       lvl2_test):
    """Standardise the "mean" columns using statistics of the training frame.

    Both frames are modified in place and also returned. The test frame is
    scaled with the training mean and standard deviation.

    NOTE(review): a column with zero standard deviation in the training frame
    divides by zero here - confirm that downstream imputation handles it.
    """
    idx = pd.IndexSlice
    lvl2_means, lvl2_stds = lvl2_train.loc[:, idx[:,"mean"]].mean(axis=0), lvl2_train.loc[:, idx[:,"mean"]].std(axis=0)

    lvl2_train.loc[:, idx[:,"mean"]] = (lvl2_train.loc[:, idx[:,"mean"]] - lvl2_means)/lvl2_stds
    lvl2_test.loc[:, idx[:,"mean"]] = (lvl2_test.loc[:, idx[:,"mean"]] - lvl2_means)/lvl2_stds
    return lvl2_train, lvl2_test
def evaluate(model, loader, criterion, device):
    """Compute the mean batch loss and the accuracy of ``model`` on ``loader``.

    Predictions are obtained by thresholding the model output at 0.5. The
    loss is averaged over batches, the accuracy over ``len(loader.dataset)``.

    Returns:
        Tuple ``(loss, accuracy)`` of Python floats.

    """
    model.eval()
    total_loss = 0
    n_correct = 0
    with no_grad():
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            # Targets become float column vectors to line up with the model output
            batch_y = batch_y.to(device).float().unsqueeze(1)
            scores = model(batch_x)
            total_loss += criterion(scores, batch_y).item()
            predicted = scores >= 0.5
            n_correct += predicted.eq(batch_y.data.view_as(predicted)).sum()
    mean_loss = total_loss / len(loader)
    accuracy = float(n_correct) / len(loader.dataset)
    return mean_loss, accuracy
nn.BCELoss(reduction="mean") + criterion = nn.BCELoss() optimizer = optim.SGD(model.parameters(), lr = lr, weight_decay = weight_decay) train_losses, train_accuracies = [], [] test_losses, test_accuracies = [], [] - + for e in tqdm.tqdm(range(epochs), desc="Training Progress"): model.train() train_acc, train_loss = 0.0, 0.0 - + for data, target in train_loader: target = target.float().unsqueeze(1) data, target = data.to(device_name, non_blocking=True), target.to(device_name, non_blocking=True) @@ -70,27 +69,30 @@ def create_trained_model_and_metadata(model, output = model(data) loss = criterion(output, target) - pred =output >= 0.5 + pred = (output) >= 0.5 train_acc += pred.eq(target).sum().item() - + loss.backward() optimizer.step() train_loss += loss.item() - - epoch_train_loss = train_loss / len(train_loader) - epoch_train_acc = train_acc / len(train_loader.dataset) - - train_losses.append(epoch_train_loss) - train_accuracies.append(epoch_train_acc) - + + train_loss /= len(train_loader) + train_acc /= len(train_loader.dataset) + + train_losses.append(train_loss) + train_accuracies.append(train_acc) + test_loss, test_acc = evaluate(model, test_loader, criterion, device_name) - # _ , train_loss = evaluate(model, train_loader, criterion, device_name) test_losses.append(test_loss) test_accuracies.append(test_acc) - + # Move the model back to the CPU model.to("cpu") - with open("target/target_model.pkl", "wb") as f: + + + if not os.path.exists("target_LR"): + os.makedirs("target_LR") + with open("target_LR/target_model.pkl", "wb") as f: save(model.state_dict(), f) # Create metadata and store it @@ -98,12 +100,12 @@ def create_trained_model_and_metadata(model, meta_data["train_indices"] = train_loader.dataset.indices meta_data["test_indices"] = test_loader.dataset.indices meta_data["num_train"] = len(meta_data["train_indices"]) - + # Write init params meta_data["init_params"] = {} for key, value in model.init_params.items(): meta_data["init_params"][key] = value - + # 
def to_3D_tensor(df):
    """Stack the rows of ``df`` hour by hour into a 3D tensor.

    Args:
        df: Frame whose index has four levels, the last one named "hours_in".
            Every hour must select the same number of rows, otherwise
            ``np.dstack`` cannot stack the slices.

    Returns:
        Tensor of shape (rows_per_hour, n_columns, n_hours), with the hours in
        ascending order along the last axis.

    """
    idx = pd.IndexSlice
    hours = sorted(set(df.index.get_level_values("hours_in")))
    np_3D = np.dstack([df.loc[idx[:, :, :, hour], :].values for hour in hours])
    return from_numpy(np_3D)


def prepare_dataloader(df, Ys, batch_size, shuffle=True):
    """Build a DataLoader of (float32 3D features, int64 labels).

    Args:
        df: Feature frame accepted by ``to_3D_tensor``.
        Ys: Labels exposing ``.values``; one label per row of the 3D tensor.
        batch_size: Batch size (cast to int).
        shuffle: Whether the loader shuffles the data.

    Returns:
        A DataLoader over a TensorDataset; incomplete final batches are dropped.

    """
    # to_3D_tensor already returns a torch tensor, so cast with .float();
    # calling .astype / from_numpy on it (as before) raises.
    X = to_3D_tensor(df).float()
    label = from_numpy(Ys.values.astype(np.int64))
    dataset = utils.TensorDataset(X, label)
    return utils.DataLoader(dataset, batch_size=int(batch_size), shuffle=shuffle, drop_last=True)


class FilterLinear(nn.Module):
    """Linear layer whose weight is multiplied element-wise by a fixed 0/1 matrix."""

    def __init__(self, in_features, out_features, filter_square_matrix, device, bias=True):
        """Create the layer.

        Args:
            in_features: Input size, must be > 1.
            out_features: Output size, must be > 1.
            filter_square_matrix: Filter matrix whose elements are 0 or 1.
            device: Device on which the filter and the parameters are created.
            bias: Whether to learn an additive bias.

        """
        super(FilterLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features

        assert in_features > 1 and out_features > 1, "Passing in nonsense sizes"

        # Non-persistent buffer: follows module.to(...) but is not in state_dict
        self.register_buffer("filter_square_matrix", filter_square_matrix.to(device), persistent=False)

        # Create the parameters directly on the target device. Parameter(...).to(device)
        # returns a plain tensor when the device differs, which is then neither
        # registered as a parameter nor optimised or saved.
        self.weight = Parameter(zeros(out_features, in_features, device=device))

        if bias:
            self.bias = Parameter(zeros(out_features, device=device))
        else:
            self.register_parameter("bias", None)
        self.reset_parameters()

    def reset_parameters(self):
        """Initialise weight and bias uniformly in [-1/sqrt(in_features), 1/sqrt(in_features)]."""
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, x):
        """Apply the linear map using the filtered (masked) weight."""
        return F.linear(
            x,
            self.filter_square_matrix.mul(self.weight),
            self.bias
        )

    def __repr__(self):
        return self.__class__.__name__ + "(" \
            + "in_features=" + str(self.in_features) \
            + ", out_features=" + str(self.out_features) \
            + ", bias=" + str(self.bias is not None) + ")"


class GRUD(nn.Module):
    def __init__(self, input_size, cell_size, hidden_size, X_mean, batch_size = 0, output_last = False):
        """With minor modifications from https://github.com/zhiyongc/GRU-D/

        Recurrent Neural Networks for Multivariate Times Series with Missing Values
        GRU-D: GRU exploit two representations of informative missingness patterns, i.e., masking and time interval.

        Implemented based on the paper:
        @article{che2018recurrent,
          title={Recurrent neural networks for multivariate time series with missing values},
          author={Che, Zhengping and Purushotham, Sanjay and Cho, Kyunghyun and Sontag, David and Liu, Yan},
          journal={Scientific reports},
          volume={8},
          number={1},
          pages={6085},
          year={2018},
          publisher={Nature Publishing Group}
        }

        Args:
            input_size: number of variables per time step (also the mask and delta size)
            cell_size: stored in init_params only; not used by the layers
            hidden_size: dimension of the hidden state
            X_mean: mean of the historical input data; indexed as X_mean[:, t, :]
                in forward - presumably shaped (1, time_steps, input_size), confirm with caller
            batch_size: stored in init_params only
            output_last: stored on the model and in init_params

        """

        super(GRUD, self).__init__()

        # Save init params to a dictionary
        self.init_params = {
            "input_size": input_size,
            "cell_size": cell_size,
            "hidden_size": hidden_size,
            "X_mean": X_mean,
            "batch_size": batch_size,
            "output_last": output_last
        }

        self.hidden_size = hidden_size
        self.delta_size = input_size
        self.mask_size = input_size

        self.device = device("cuda" if cuda.is_available() else "cpu")
        self.identity = eye(input_size).to(self.device)
        # Non-persistent buffer: moves with model.to(...), stays out of state_dict
        self.register_buffer("X_mean", Tensor(X_mean).to(self.device), persistent=False)

        # Wz, Uz are part of the same network. the bias is bz
        self.zl = nn.Linear(input_size + hidden_size + self.mask_size, hidden_size).to(self.device)

        # Wr, Ur are part of the same network. the bias is br
        self.rl = nn.Linear(input_size + hidden_size + self.mask_size, hidden_size).to(self.device)

        # W, U are part of the same network. the bias is b
        self.hl = nn.Linear(input_size + hidden_size + self.mask_size, hidden_size).to(self.device)

        self.gamma_x_l = FilterLinear(self.delta_size, self.delta_size, self.identity, self.device)
        self.gamma_h_l = nn.Linear(self.delta_size, self.hidden_size).to(self.device)
        self.output_last = output_last

        self.fc = nn.Linear(self.hidden_size, 2).to(self.device)
        self.bn = nn.BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True).to(self.device)
        self.drop = nn.Dropout(p=0.7, inplace=False)

    def step(self, x, x_last_obsv, x_mean, h, mask, delta):
        """Advance the hidden state by one time step.

        Args:
            x: input tensor
            x_last_obsv: input tensor with forward fill applied
            x_mean: the mean of each feature
            h: the hidden state of the network
            mask: the mask of whether or not the current value is observed
            delta: the tensor indicating the number of steps since the last time a feature was observed.

        Returns:
            h: the updated hidden state of the network

        """
        # Assert to check for NaNs in x_mean
        assert not isnan(x_mean).any(), "NaN values found in x_mean"

        batch_size = x.size()[0]
        feature_size = x.size()[1]
        # Allocate on the input's device so the model still works after model.to(...)
        zero_x = zeros(batch_size, feature_size, device=x.device)
        zero_h = zeros(batch_size, self.hidden_size, device=x.device)

        # Decay rates exp(-max(0, W * delta + b)) for the input and the hidden state
        gamma_x_l_delta = self.gamma_x_l(delta)
        delta_x = exp(-max(zero_x, gamma_x_l_delta))

        gamma_h_l_delta = self.gamma_h_l(delta)
        delta_h = exp(-max(zero_h, gamma_h_l_delta))

        x_mean = x_mean.repeat(batch_size, 1)

        # Missing values decay from the last observation towards the mean
        x = mask * x + (1 - mask) * (delta_x * x_last_obsv + (1 - delta_x) * x_mean)
        h = delta_h * h

        combined = cat((x, h, mask), 1)
        # Assert to check for NaNs in combined
        assert not isnan(combined).any(), "NaN values found in combined"

        z = sigmoid(self.zl(combined))  # sigmoid(W_z*x_t + U_z*h_{t-1} + V_z*m_t + bz)
        r = sigmoid(self.rl(combined))  # sigmoid(W_r*x_t + U_r*h_{t-1} + V_r*m_t + br)
        combined_new = cat((x, r * h, mask), 1)
        h_tilde = tanh(self.hl(combined_new))  # tanh(W*x_t + U(r_t*h_{t-1}) + V*m_t + b)
        h = (1 - z) * h + z * h_tilde

        return h

    def forward(self, X):
        """Run the recurrence over all time steps and return two class logits.

        Args:
            X: Tensor of shape (batch_size, 3 * features, time_steps). Along
                dim 1 the channels are interleaved per feature as mask,
                measurement, delta (every third channel starting at 0, 1, 2).

        Returns:
            Tensor of shape (batch_size, 2): the final hidden state passed
            through the linear layer, batch norm and dropout.

        """
        batch_size = X.size(0)

        # De-interleave dim 1 and move time to dim 1: (batch, time_steps, features)
        Mask = X[:, 0::3, :].transpose(1, 2)
        Measurement = X[:, 1::3, :].transpose(1, 2)
        Delta = X[:, 2::3, :].transpose(1, 2)

        # The measurement itself is used as the last observed value
        X_last_obsv = Measurement

        step_size = Measurement.size(1)  # Number of time points
        Hidden_State = self.initHidden(batch_size)

        # Only the final hidden state is used, so no per-step outputs are collected
        for i in range(step_size):
            Hidden_State = self.step(
                Measurement[:, i, :],
                X_last_obsv[:, i, :],
                self.X_mean[:, i, :],
                Hidden_State,
                Mask[:, i, :],
                Delta[:, i, :],
            )

        # Predict a binary outcome using FC, BatchNorm, and Dropout layers
        return self.drop(self.bn(self.fc(Hidden_State)))

    def initHidden(self, batch_size):
        """Return an all-zero hidden state on the device of the model weights."""
        return zeros(batch_size, self.hidden_size, device=self.fc.weight.device)


def to_numpy(tensor):
    """Detach ``tensor`` from the graph and return it as a NumPy array on the CPU."""
    return tensor.detach().cpu().numpy()
def gru_trained_model_and_metadata(model,
                                   train_dataloader,
                                   test_dataloader,
                                   epochs,
                                   patience_early_stopping,
                                   patience_lr,
                                   min_delta,
                                   learning_rate):
    """Train ``model`` with cross-entropy loss and store the model and its metadata.

    After every epoch the model is evaluated on the whole ``test_dataloader``;
    that loss drives both early stopping and the learning-rate scheduler. The
    trained weights are written to ``target_GRUD/target_model.pkl`` and the
    metadata dictionary to ``target_GRUD/model_metadata.pkl``.

    The loss is always cross-entropy on the model output: the previous code
    forced ``output_last = True`` inside the batch loop, so its MSE branch
    could never run.

    Args:
        model: Module exposing ``init_params`` and returning class logits.
        train_dataloader: Loader over a Subset (``.dataset.indices`` and
            ``.dataset.dataset.x`` are read).
        test_dataloader: Loader over a Subset (``.dataset.indices`` is read).
        epochs: Maximum number of epochs.
        patience_early_stopping: Epochs without improvement before stopping.
        patience_lr: Patience handed to ReduceLROnPlateau.
        min_delta: Minimum decrease of the test loss that counts as improvement.
        learning_rate: Initial Adam learning rate.

    Returns:
        Tuple ``(train_losses, test_losses, train_acces, test_acces)`` with one
        entry per completed epoch.

    """
    # Local import: no_grad is not among the names this module imports from torch
    from torch import no_grad

    print("Model Structure: ", model)
    print("Start Training ... ")

    # Check if the input tensor is 3D
    # This check is necessary because the GRU-D model expects a 3D tensor, meaning the input data should not be flattened
    # The input tensor should have the shape (num_datapoints, num_features, num_timepoints)
    if train_dataloader.dataset.dataset.x.ndimension() != 3:
        warnings.warn(
            "Input tensor is not 3D. GRU-D expects unflattened data of shape "
            "(num_datapoints, num_features, num_timepoints).",
            UserWarning,
        )

    # Early Stopping
    min_loss_epoch_valid = float("inf")  # Initialize to infinity for comparison
    patient_epoch = 0  # Initialize patient counter

    device_name = device("cuda" if cuda.is_available() else "cpu")

    criterion_CEL = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Reduce learning rate when a metric has stopped improving
    scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=patience_lr)

    train_losses = []
    test_losses = []
    test_acces = []
    train_acces = []
    # Defined up front so the metadata below can be written even if epochs == 0
    train_loss = test_loss = train_acc = test_acc = float("nan")

    cur_time = time.time()
    pre_time = time.time()

    model.to(device_name)

    def _logits_and_targets(X, labels):
        """Run the model and flatten output/labels to (n, classes) and (n,)."""
        prediction = model(X.to(device_name))
        logits = prediction.reshape(-1, prediction.shape[-1])
        targets = labels.to(device_name).long().reshape(-1)
        return logits, targets

    for epoch in tqdm(range(epochs), desc="Training Progress"):

        model.train()
        train_loss = 0.0
        train_correct, train_seen = 0, 0

        for X, labels in tqdm(train_dataloader, desc="Training Batches"):
            logits, targets = _logits_and_targets(X, labels)
            loss = criterion_CEL(logits, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            # Accumulate over every batch instead of scoring only the last one
            train_correct += (logits.argmax(dim=1) == targets).sum().item()
            train_seen += targets.numel()

        train_loss /= len(train_dataloader)
        train_losses.append(train_loss)
        train_acc = train_correct / train_seen
        train_acces.append(train_acc)

        # test: evaluate on the full loader. Recreating an iterator every epoch
        # and taking one batch (as before) only ever scored the first batch.
        model.eval()
        test_loss = 0.0
        test_correct, test_seen = 0, 0
        with no_grad():
            for X_test, labels_test in test_dataloader:
                logits_test, targets_test = _logits_and_targets(X_test, labels_test)
                test_loss += criterion_CEL(logits_test, targets_test).item()
                test_correct += (logits_test.argmax(dim=1) == targets_test).sum().item()
                test_seen += targets_test.numel()

        test_loss /= len(test_dataloader)
        test_losses.append(test_loss)
        test_acc = test_correct / test_seen
        test_acces.append(test_acc)

        # Early stopping on the test loss
        if test_loss < min_loss_epoch_valid - min_delta:  # Improvement condition
            min_loss_epoch_valid = test_loss
            patient_epoch = 0
            print(f"Epoch {epoch}: Validation loss improved to {test_loss:.4f}")
        else:
            patient_epoch += 1
            print(f"Epoch {epoch}: No improvement. Patience counter: {patient_epoch}/{patience_early_stopping}")

            if patient_epoch >= patience_early_stopping:
                print(f"Early stopping at epoch {epoch}. Best validation loss: {min_loss_epoch_valid:.4f}")
                break

        # Step the scheduler
        scheduler.step(test_loss)

        # Check the learning rate
        current_lr = optimizer.param_groups[0]["lr"]
        print(f"Learning Rate: {current_lr:.6f}")

        # Stop if learning rate becomes too small
        if current_lr < 1e-6:
            print("Learning rate too small, stopping training.")
            break

        # Print training parameters
        cur_time = time.time()
        print("Epoch: {}, train_loss: {}, valid_loss: {}, time: {}".format(
            epoch,
            np.around(train_loss, decimals=8),
            np.around(test_loss, decimals=8),
            np.around(cur_time - pre_time, decimals=2)))
        pre_time = cur_time

    # Ensure the target directory exists, then move the model back to the CPU
    os.makedirs("target_GRUD", exist_ok=True)
    model.to("cpu")
    with open("target_GRUD/target_model.pkl", "wb") as f:
        save(model.state_dict(), f)

    # Create metadata and store it
    meta_data = {}
    meta_data["train_indices"] = train_dataloader.dataset.indices
    meta_data["test_indices"] = test_dataloader.dataset.indices
    meta_data["num_train"] = len(meta_data["train_indices"])

    # Write init params
    meta_data["init_params"] = {}
    for key, value in model.init_params.items():
        meta_data["init_params"][key] = value

    # read out optimizer parameters
    meta_data["optimizer"] = {}
    meta_data["optimizer"]["name"] = optimizer.__class__.__name__.lower()
    meta_data["optimizer"]["lr"] = optimizer.param_groups[0].get("lr", 0)
    meta_data["optimizer"]["weight_decay"] = optimizer.param_groups[0].get("weight_decay", 0)
    meta_data["optimizer"]["momentum"] = optimizer.param_groups[0].get("momentum", 0)
    meta_data["optimizer"]["dampening"] = optimizer.param_groups[0].get("dampening", 0)
    meta_data["optimizer"]["nesterov"] = optimizer.param_groups[0].get("nesterov", False)

    # read out criterion parameters
    meta_data["loss"] = {}
    meta_data["loss"]["name"] = criterion_CEL.__class__.__name__.lower()

    meta_data["batch_size"] = train_dataloader.batch_size
    meta_data["epochs"] = epochs
    meta_data["train_acc"] = train_acc
    meta_data["test_acc"] = test_acc
    meta_data["train_loss"] = train_loss
    meta_data["test_loss"] = test_loss
    meta_data["dataset"] = "mimiciii"
    with open("target_GRUD/model_metadata.pkl", "wb") as f:
        pickle.dump(meta_data, f)
    return train_losses, test_losses, train_acces, test_acces
shadow models to train - online: True # perform online or offline attack - lr_xprime_optimization: .01 - max_iterations: 35 + # yoqo: + # training_data_fraction: 0.5 # Fraction of the auxilary dataset to use for this attack (in each shadow model training) + # num_shadow_models: 8 # Number of shadow models to train + # online: True # perform online or offline attack + # lr_xprime_optimization: .01 + # max_iterations: 35 output_dir: "./leakpro_output" attack_type: "mia" #mia, gia diff --git a/leakpro/attacks/utils/shadow_model_handler.py b/leakpro/attacks/utils/shadow_model_handler.py index b06612ef..de8d25b9 100755 --- a/leakpro/attacks/utils/shadow_model_handler.py +++ b/leakpro/attacks/utils/shadow_model_handler.py @@ -59,25 +59,44 @@ def __init__(self:Self, handler: AbstractInputHandler) -> None: # noqa: PLR0912 """ caller = "shadow_model" super().__init__(handler, caller) - self.configs = handler.configs.get("shadow_model", None) + + shadow_model = self.handler.configs.get("shadow_model", None) + self.shadow_model_type = shadow_model.get("model_class", None) if isinstance(shadow_model, dict) else None + + self.target_model_type = self.handler.configs.get("target", {}).get("model_class", None) + if self.target_model_type is None: + raise ValueError("Target model type is not specified") # Set up the names of the shadow model self.model_storage_name = "shadow_model" self.metadata_storage_name = "metadata" - def _filter(self:Self, data_size:int, online:bool)->list[int]: + def _filter(self:Self, data_size:int, online:bool, model_type: str)->list[int]: # Get the metadata for the shadow models entries = os.listdir(self.storage_path) pattern = re.compile(rf"^{self.metadata_storage_name}_\d+\.pkl$") files = [f for f in entries if pattern.match(f)] # Extract the index of the metadata all_indices = [int(re.search(r"\d+", f).group()) for f in files] + # Filter out indices to only keep the ones with the same data size filtered_indices = [] + mismatched_model_types = [] + for 
i in all_indices: metadata = self._load_shadow_metadata(i) if metadata["num_train"] == data_size and metadata["online"] == online: - filtered_indices.append(i) + if metadata.get("model_type") == model_type: + filtered_indices.append(i) + else: + mismatched_model_types.append((i, metadata.get("model_type", "Unknown"))) + + # Warn about mismatched model types + if mismatched_model_types: + logger.warning( + f"Mismatched model types found in saved shadow models: {mismatched_model_types}. " + f"Expected model type: {model_type}." + ) return all_indices, filtered_indices def create_shadow_models( @@ -104,9 +123,18 @@ def create_shadow_models( if num_models < 0: raise ValueError("Number of models cannot be negative") + # Get shadow model class + if self.shadow_model_type is None: + logger.warning( + "Using the same model class for shadow models as the target model." + ) + shadow_model_type = self.target_model_type + else: + shadow_model_type = self.shadow_model_type + # Get the size of the dataset data_size = int(len(shadow_population)*training_fraction) - all_indices, filtered_indices = self._filter(data_size, online) + all_indices, filtered_indices = self._filter(data_size, online, shadow_model_type) # Create a list of indices to use for the new shadow models n_existing_models = len(filtered_indices) @@ -143,18 +171,19 @@ def create_shadow_models( logger.info(f"Saved shadow model {i} to {self.storage_path}") logger.info(f"Storing metadata for shadow model {i}") - meta_data = {} - meta_data["init_params"] = self.init_params - meta_data["train_indices"] = data_indices - meta_data["num_train"] = len(data_indices) - meta_data["optimizer"] = optimizer.__class__.__name__ - meta_data["criterion"] = criterion.__class__.__name__ - meta_data["batch_size"] = self.batch_size - meta_data["epochs"] = self.epochs - meta_data["train_acc"] = train_acc - meta_data["train_loss"] = train_loss - meta_data["online"] = online - + meta_data = { + "init_params": self.init_params, + 
"train_indices": data_indices, + "num_train": len(data_indices), + "optimizer": optimizer.__class__.__name__, + "criterion": criterion.__class__.__name__, + "batch_size": self.batch_size, + "epochs": self.epochs, + "train_acc": train_acc, + "train_loss": train_loss, + "online": online, + "model_type": shadow_model_type, + } with open(f"{self.storage_path}/{self.metadata_storage_name}_{i}.pkl", "wb") as f: pickle.dump(meta_data, f) diff --git a/leakpro/input_handler/abstract_input_handler.py b/leakpro/input_handler/abstract_input_handler.py index 1fa65a8a..c546fc5e 100755 --- a/leakpro/input_handler/abstract_input_handler.py +++ b/leakpro/input_handler/abstract_input_handler.py @@ -25,6 +25,11 @@ def get_optimizer(self:Self, model:torch.nn.Module) -> torch.optim.Optimizer: """Get the optimizer used for the target model to be used in model training.""" pass + # @abstractmethod + # def get_shadow_model_type(self:Self) -> str: + # """Get the type of shadow model to be used in the attack.""" + # pass + @abstractmethod def train( self: Self,