Commit 820cc89 (1 parent: 57b244a), showing 40 changed files with 2,895 additions and 1 deletion.
@@ -0,0 +1,118 @@
# Differentially Private Convex Optimization Benchmark

A benchmark and implementations for differentially private convex optimization algorithms.

The algorithms implemented in this repository are as follows:
1. Approximate Minima Perturbation - an original algorithm proposed in our paper.
2. Hyperparameter-free Approximate Minima Perturbation - a hyperparameter-free version of 1.
3. Private Stochastic Gradient Descent in [scs13](http://ai2-s2-pdfs.s3.amazonaws.com/6154/ce8c02375184f7928e41c4fae532500f7175.pdf)
4. Private Convex Perturbation-Based Stochastic Gradient Descent in [wlk17](https://arxiv.org/pdf/1606.04722.pdf)
5. Private Strongly Convex Perturbation-Based Stochastic Gradient Descent in [wlk17](https://arxiv.org/pdf/1606.04722.pdf)
6. Private Frank-Wolfe in [ttz16](https://arxiv.org/pdf/1411.5417.pdf)
## Getting Started

These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. The code is currently implemented using NumPy and requires Python 3.5 or newer. You will also need to install all dependencies listed in the requirements.txt file in the repository; the recommended way to do this is through a Python virtual environment.
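For example, you can confirm that a suitable interpreter is available before proceeding (assuming `python3` is on your PATH):
```bash
python3 --version
```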
### Virtual Environment

You can set up a virtual environment as follows:

1. Navigate to the directory where you checked out this repository.
2. Create a virtual environment named *venv* by running:
```bash
python3 -m venv venv
```
If any needed packages are missing, you should get an error message telling you which ones to install.

3. Activate the virtual environment. On POSIX systems, run:
```bash
source venv/bin/activate
```
There is a corresponding script for Windows systems located at *venv/Scripts/activate*. However, none of the code in this repository has been tested on Windows.
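On Windows, the equivalent activation step would typically look like this (untested here, as noted above):
```
venv\Scripts\activate
```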
### Prerequisites

```
cycler==0.10.0
matplotlib==2.0.2
numpy==1.13.0
pyparsing==2.2.0
python-dateutil==2.6.0
pytz==2017.2
scipy==0.19.0
scikit-learn==0.18.1
six==1.10.0
xlrd==1.0.0
```
### Installing

#### Linux

1. Navigate to this repository.
2. Run the following command:

```bash
pip install -r requirements.txt
```
#### Windows

1. Navigate to this repository.
2. Open requirements.txt in the repo and run `pip install` for each of the prerequisites in order, as sketched below.
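A minimal sketch of that per-package approach, assuming `pip` points at the Python 3 interpreter in your environment (only the first few packages are shown):
```
pip install cycler==0.10.0
pip install matplotlib==2.0.2
pip install numpy==1.13.0
```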
## Running the benchmark

### Download and preprocess the datasets

1. Navigate to the *datasets* directory.
2. Run the following command to download and preprocess all of the benchmark datasets automatically.
```bash
python main_preprocess.py all
```
3. If you want to download only one of the datasets, replace *all* with the name of the dataset, as shown below. The available datasets are:
```
adult, covertype, gisette, kddcup99, mnist, realsim, rcv1
```
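For example, to download and preprocess only the adult dataset:
```bash
python main_preprocess.py adult
```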
### Run the benchmarks

1. Navigate to this repository.
2. Run algorithms on one dataset using the following command (see the example after this list).

```bash
python gridsearch.py [ALG_NAME] [DATASET_NAME] [MODEL_NAME]
```

3. Available values for ALG_NAME:
```
ALL: all the algorithms
AMP: Flexible Objective Perturbation
AMP-NT: Hyperparameter-free Flexible Objective Perturbation
PSGD: Private Stochastic Gradient Descent
PPSGD: Private Convex Perturbation-Based Stochastic Gradient Descent
PPSSGD: Private Strongly Convex Perturbation-Based Stochastic Gradient Descent
FW: Private Frank-Wolfe
```

4. Available values for DATASET_NAME:
```
adult, covertype, gisette, kddcup99, mnist, realsim, rcv1
```
5. Available values for MODEL_NAME:
```
LR: Logistic Regression
SVM: Huber SVM without kernel functions
```
6. The results are stored as CSV files in *dpml-algorithms/results/rough_results*.
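For example, to run Approximate Minima Perturbation on the adult dataset with a logistic regression model:
```bash
python gridsearch.py AMP adult LR
```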
### Draw the graphs

1. Navigate to this repository.
2. Run the following command to generate the graph after running the corresponding benchmark (see the example after this list).
```bash
python draw.py [DATASET_NAME] [ALG_NAME] [MODEL_NAME]
```
3. The graphs are stored in *dpml-algorithms/results/graphs*.
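For example, to plot the results of the benchmark run above:
```bash
python draw.py adult AMP LR
```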
@@ -0,0 +1,191 @@
import numpy as np
from common.common import Algorithm, LEARNING_RATE_CONSTANT, DEFAULT_NUM_ITERS
from lossfunctions.logistic_regression import (
    LogisticRegression, LogisticRegressionSinglePoint)
from lossfunctions.huber_svm import HuberSVM
from scipy.optimize import minimize
from scipy.sparse import csr_matrix, hstack
import logging
import os

USE_LOWMEM = False


def amp_run_classification(x, y, loss_func, grad_func,
                           epsilon, delta, lambda_param,
                           learning_rate=None, num_iters=None,
                           l2_constraint=None, eps_frac=0.9,
                           eps_out_frac=0.01,
                           gamma=None, L=1, gamma_mult=1):
    n = x.shape[0]
    m = x.shape[1]
    lmbda = pow(L, 2)
    r = 2  # for GLMs
    beta = pow(L, 2)  # from psgd

    # initial model
    x0 = np.zeros(shape=x.shape[1])

    # hard-code the split for obj/out
    delta_out_frac = eps_out_frac

    # strategy for split within obj
    if eps_frac is None:
        # old strategy
        # best = 0.796 + 0.149*np.exp(-3.435*epsilon)

        # "Strategy #1"
        best = min(0.88671 + 0.0186607/(epsilon**0.372906), .99)

        # "Strategy #2"
        # best = 0.909994+0.0769162*np.exp(-9.41309*epsilon)

        eps_frac = max(best, 1 - 1/epsilon + 0.001)
    # split the budget 3 ways
    eps_out = epsilon*eps_out_frac
    eps_obj = epsilon - eps_out
    eps_p = eps_frac * eps_obj

    delta_out = delta_out_frac * delta
    delta_obj = delta - delta_out

    # set the lower bound on regularization
    big_lambda = r * beta / (eps_obj - eps_p)

    # set gamma
    if gamma is None:
        if USE_LOWMEM:
            gamma = 1.0/n
        else:
            gamma = 1.0/(n**2)

    # enforce the constraint on eps_p
    if (eps_obj - eps_p) >= 1:
        return x0, gamma

    effective_gamma = gamma * gamma_mult

    # set the sensitivity
    sensitivity_obj = 2*L / n
    sensitivity_out = n*gamma / big_lambda

    # set the std dev of noise for obj part
    std_dev_obj = sensitivity_obj * (1 + np.sqrt(2 * np.log(1 / delta_obj))) / eps_p
    std_dev_out = sensitivity_out * (1 + np.sqrt(2 * np.log(1 / delta_out))) / eps_out

    # generate the noise for obj part
    np.random.seed(ord(os.urandom(1)))
    noise_obj = np.random.normal(scale=std_dev_obj, size=x.shape[1])

    # generate the noise for out part
    noise_out = np.random.normal(scale=std_dev_out, size=x.shape[1])

    if l2_constraint is None:
        x0 = np.zeros(shape=x.shape[1])
    else:
        x0 = (np.random.rand(x.shape[1]) - .5) * 2 * l2_constraint

    def private_loss(theta, x, y):
        raw_loss = loss_func(theta, x, y)
        result = (raw_loss + ((big_lambda/(2*n)) *
                              (np.linalg.norm(theta, ord=2) ** 2)) +
                  (noise_obj.T @ theta)) * gamma_mult
        return result

    def private_gradient(theta, x, y, use_gamma_mult=True):
        raw_gradient = grad_func(theta, x, y)
        result = raw_gradient + ((big_lambda/n) * theta) + noise_obj
        if use_gamma_mult:
            result *= gamma_mult
        return result
    if USE_LOWMEM:
        c = 200
        opts = {'gtol': effective_gamma/c}
        result = minimize(private_loss, x0, (x, y), method='L-BFGS-B',
                          jac=private_gradient, options=opts)
        theta = result.x
        grad = private_gradient(theta, x, y)
        norm = np.linalg.norm(grad, ord=2)

        if norm <= effective_gamma:
            theta_mid = result.x
            return theta_mid + noise_out, gamma
        else:
            if effective_gamma < 1e-04:
                gamma_mult *= 10
            else:
                gamma_mult = 1
                gamma *= 2
            return amp_run_classification(x, y, loss_func, grad_func, epsilon, delta, lambda_param,
                                          learning_rate=learning_rate, num_iters=None, l2_constraint=l2_constraint,
                                          eps_frac=eps_frac, gamma=gamma, L=L, gamma_mult=gamma_mult)
    else:
        # constrain_l2_norm is expected to be defined elsewhere in the repository
        # (it is not imported in this file)
        def constrain_theta(theta):
            theta = constrain_l2_norm(theta, l2_constraint)

        if l2_constraint is not None:
            cb = constrain_theta
        else:
            cb = None

        opts = {'gtol': effective_gamma, 'norm': 2}
        result = minimize(private_loss, x0, (x, y), method='BFGS',
                          jac=private_gradient, options=opts, callback=cb)
        theta = result.x
        grad = private_gradient(theta, x, y)
        norm = np.linalg.norm(grad, ord=2)

        if not result.success:
            if effective_gamma < 1e-04:
                gamma_mult *= 10
            else:
                gamma_mult = 1
                gamma *= 2

            return amp_run_classification(x, y, loss_func, grad_func, epsilon, delta, lambda_param,
                                          learning_rate=learning_rate, num_iters=None, l2_constraint=l2_constraint,
                                          eps_frac=eps_frac, gamma=gamma, L=L, gamma_mult=gamma_mult)
        else:
            orig_gamma = 1/(n**2)
            orig_grad = private_gradient(theta, x, y, use_gamma_mult=False)
            orig_norm = np.linalg.norm(orig_grad, ord=2)

            theta_mid = result.x
            return theta_mid + noise_out, gamma


class ApproximateMinimaPerturbationLR(Algorithm):
    def run_classification(x, y, epsilon, delta, lambda_param,
                           learning_rate=None, num_iters=None,
                           l2_constraint=None, eps_frac=0.9,
                           eps_out_frac=0.01,
                           gamma=None, L=1):
        return amp_run_classification(x, y, LogisticRegression.loss, LogisticRegression.gradient,
                                      epsilon, delta, lambda_param,
                                      learning_rate=learning_rate, num_iters=num_iters,
                                      l2_constraint=l2_constraint, eps_frac=eps_frac,
                                      eps_out_frac=eps_out_frac,
                                      gamma=gamma, L=L)

    def name():
        return "Approximate minima perturbation with scipy minimize LR"


class ApproximateMinimaPerturbationSVM(Algorithm):
    def run_classification(x, y, epsilon, delta, lambda_param,
                           learning_rate=None, num_iters=None,
                           l2_constraint=None, eps_frac=0.9,
                           eps_out_frac=0.01,
                           gamma=None, L=1):
        return amp_run_classification(x, y, HuberSVM.loss, HuberSVM.gradient,
                                      epsilon, delta, lambda_param,
                                      learning_rate=learning_rate, num_iters=num_iters,
                                      l2_constraint=l2_constraint, eps_frac=eps_frac,
                                      eps_out_frac=eps_out_frac,
                                      gamma=gamma, L=L)

    def name():
        return "Approximate minima perturbation with scipy minimize SVM"