Merge pull request #425 from Yelp/eliu_gh_138_expose_cpp_struct_data_members_to_python

optimizer param struct improvements
suntzu86 committed Dec 1, 2014
2 parents 53318ab + 44f645b commit 83141c7
Showing 20 changed files with 449 additions and 87 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,8 @@
* Added startup message to REST server including tips for OSX users (#400)
* Added GPU support to ``cpp_wrappers.expected_improvement.multistart_expected_improvement_optimization``; requires ``max_num_threads == 1`` until future multi-GPU support (#368)
* Added the COBYLA optimizer to the expected improvement optimization class. (#370)
* OptimizerParameter struct members now directly readable/writeable from Python; added EqualityComparisonMixin (#138)
* C++ GradientDescentParameters object now stores ``num_steps_averaged`` (but does not use it yet) (#391)

* Changes

4 changes: 3 additions & 1 deletion moe/optimal_learning/cpp/gpp_expected_improvement_demo.cpp
@@ -199,7 +199,9 @@ int main() {
int num_multistarts = 30; // max number of multistarted locations
int max_num_steps = 500; // maximum number of GD iterations per restart
int max_num_restarts = 20; // number of restarts to run with GD
GradientDescentParameters gd_params(num_multistarts, max_num_steps, max_num_restarts, gamma,
int num_steps_averaged = 0; // number of steps to use in polyak-ruppert averaging
GradientDescentParameters gd_params(num_multistarts, max_num_steps, max_num_restarts,
num_steps_averaged, gamma,
pre_mult, max_relative_change, tolerance);
// so the total number of GD iterations is at most:
// num_multistarts * max_num_restarts * max_num_steps
@@ -242,9 +242,10 @@ int CudaExpectedImprovementOptimizationMultipleSamplesTest() {
const double tolerance = 1.0e-5;
const int max_gradient_descent_steps = 100;
const int max_num_restarts = 3;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, gamma, pre_mult,
max_num_restarts, num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

// grid search parameters
@@ -403,8 +404,10 @@ int CudaExpectedImprovementOptimizationAnalyticTest() {
const double tolerance = 1.0e-7;
const int max_gradient_descent_steps = 1000;
const int max_num_restarts = 10;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps, max_num_restarts,
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, num_steps_averaged,
gamma, pre_mult, max_relative_change, tolerance);

// grid search parameters
@@ -306,9 +306,10 @@ int HeuristicExpectedImprovementOptimizationTestCore(EstimationPolicyTypes polic
const double tolerance = 1.0e-12;
const int max_gradient_descent_steps = 300;
const int max_num_restarts = 5;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, gamma, pre_mult,
max_num_restarts, num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

static const int kMaxNumThreads = 4;
4 changes: 3 additions & 1 deletion moe/optimal_learning/cpp/gpp_hyper_and_EI_demo.cpp
@@ -198,7 +198,9 @@ int main() {
int num_multistarts = 10; // max number of multistarted locations
int max_num_steps = 500; // maximum number of GD iterations per restart
int max_num_restarts = 20; // number of restarts to run with GD
GradientDescentParameters gd_params(num_multistarts, max_num_steps, max_num_restarts, gamma,
int num_steps_averaged = 0; // number of steps to use in polyak-ruppert averaging
GradientDescentParameters gd_params(num_multistarts, max_num_steps, max_num_restarts,
num_steps_averaged, gamma,
pre_mult, max_relative_change_ei, tolerance_ei);

// EI evaluation parameters
15 changes: 11 additions & 4 deletions moe/optimal_learning/cpp/gpp_math_test.cpp
@@ -1051,7 +1051,10 @@ int MultithreadedEIOptimizationTest(ExpectedImprovementEvaluationMode ei_mode) {

const int max_gradient_descent_steps = 250;
const int max_num_restarts = 3;
GradientDescentParameters gd_params(0, max_gradient_descent_steps, max_num_restarts, gamma, pre_mult, max_relative_change, tolerance);
const int num_steps_averaged = 0;
GradientDescentParameters gd_params(0, max_gradient_descent_steps, max_num_restarts,
num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

int max_mc_iterations = 967;

@@ -1259,8 +1262,10 @@ OL_WARN_UNUSED_RESULT int ExpectedImprovementOptimizationTestCore(ExpectedImprov
const double tolerance = 1.0e-7;
const int max_gradient_descent_steps = 1000;
const int max_num_restarts = 10;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps, max_num_restarts,
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, num_steps_averaged,
gamma, pre_mult, max_relative_change, tolerance);

// grid search parameters
@@ -1459,9 +1464,10 @@ OL_WARN_UNUSED_RESULT int ExpectedImprovementOptimizationSimplexTestCore(Expecte
const double tolerance = 1.0e-7;
const int max_gradient_descent_steps = 1000;
const int max_num_restarts = 10;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, gamma, pre_mult,
max_num_restarts, num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

// grid search parameters
@@ -1673,9 +1679,10 @@ int ExpectedImprovementOptimizationMultipleSamplesTest() {
const double tolerance = 1.0e-5;
const int max_gradient_descent_steps = 250;
const int max_num_restarts = 3;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, gamma, pre_mult,
max_num_restarts, num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

// grid search parameters
3 changes: 2 additions & 1 deletion moe/optimal_learning/cpp/gpp_model_selection_test.cpp
@@ -366,7 +366,8 @@ OL_WARN_UNUSED_RESULT int HyperparameterLikelihoodOptimizationTestCore(LogLikeli
}
const int max_gradient_descent_steps = 600;
const int max_num_restarts = 5;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts, gamma, pre_mult, max_relative_change, tolerance);
const int num_steps_averaged = 0;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts, num_steps_averaged, gamma, pre_mult, max_relative_change, tolerance);

int total_errors = 0;
int current_errors = 0;
2 changes: 2 additions & 0 deletions moe/optimal_learning/cpp/gpp_optimization.hpp
@@ -551,6 +551,8 @@ struct OptimizationIOContainer final {
};

/*!\rst
TODO(GH-390): Implement Polyak-Ruppert Averaging for Gradient Descent
Implements gradient descent to find a locally optimal (maximal here) value of the specified objective function.
Additional high-level discussion is provided in section 2a) in the header docs of this file.
10 changes: 8 additions & 2 deletions moe/optimal_learning/cpp/gpp_optimization_test.cpp
@@ -214,7 +214,10 @@ OL_WARN_UNUSED_RESULT int MockObjectiveGradientDescentOptimizationTestCore() {
const double tolerance = 1.0e-12;
const int max_gradient_descent_steps = 1000;
const int max_num_restarts = 10;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts, gamma, pre_mult, max_relative_change, tolerance);
const int num_steps_averaged = 0;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts,
num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

int total_errors = 0;
int current_errors = 0;
@@ -341,7 +344,10 @@ OL_WARN_UNUSED_RESULT int MockObjectiveGradientDescentConstrainedOptimizationTes
const double tolerance = 1.0e-12;
const int max_gradient_descent_steps = 1000;
const int max_num_restarts = 10;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts, gamma, pre_mult, max_relative_change, tolerance);
const int num_steps_averaged = 0;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts,
num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

int total_errors = 0;
int current_errors = 0;
29 changes: 27 additions & 2 deletions moe/optimal_learning/cpp/gpp_optimizer_parameters.hpp
@@ -51,6 +51,20 @@ struct NullParameters {
The total number of gradient descent steps is at most ``num_multistarts * max_num_steps * max_num_restarts``
Generally, allowing more iterations leads to a better solution but costs more time.
**Averaging (TODO(GH-390): NOT IMPLEMENTED YET)**
When optimizing stochastic objective functions, it can often be beneficial to average some number of gradient descent
steps to obtain the final result (vs just returning the last step).
Polyak-Ruppert averaging: postprocessing step where we replace ``x_n`` with:
``\bar{x} = \frac{1}{n - n_0} \sum_{t=n_0 + 1}^n x_t``
``n_0 = 0`` averages all steps; ``n_0 = n - 1`` is equivalent to returning ``x_n`` directly.
Here, num_steps_averaged is ``n - n_0``.
* ``num_steps_averaged`` < 0: averages all steps
* ``num_steps_averaged`` == 0: do not average
* ``num_steps_averaged`` > 0 and <= ``max_num_steps``: average the specified number of steps
* ``num_steps_averaged`` > ``max_num_steps``: average all steps
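To make the clamping rules above concrete, here is a minimal, illustrative Python sketch of the intended post-processing. The helper is hypothetical and not part of this commit; per GH-390 the real averaging is slated for the C++ gradient descent loop.

```python
def polyak_ruppert_average(iterates, num_steps_averaged):
    """Average the trailing gradient descent iterates, per the clamping rules above.

    Illustrative sketch only. ``iterates`` is the list [x_1, ..., x_n] of points
    visited by gradient descent, each a list of coordinates.
    """
    n = len(iterates)
    if num_steps_averaged < 0 or num_steps_averaged > n:
        num_steps_averaged = n            # average all steps
    if num_steps_averaged == 0:
        return iterates[-1]               # no averaging: return x_n directly
    # n_0 = n - num_steps_averaged, so we average x_{n_0 + 1}, ..., x_n.
    tail = iterates[-num_steps_averaged:]
    return [sum(coord) / num_steps_averaged for coord in zip(*tail)]
```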
**Learning Rate**
GD may be implemented using a learning rate: ``pre_mult * (i+1)^{-\gamma}``, where i is the current iteration
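As a quick illustration of that schedule (values picked from the suggested ranges in these docs, purely for demonstration):

```python
# Step-size schedule used by gradient descent: pre_mult * (i + 1)^{-gamma}.
# Larger gamma shrinks steps faster; larger pre_mult takes bigger initial steps.
gamma, pre_mult = 0.7, 0.4   # illustrative values from the suggested ranges
step_sizes = [pre_mult * (i + 1) ** (-gamma) for i in range(5)]
# -> roughly [0.400, 0.246, 0.185, 0.152, 0.130]
```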
@@ -74,10 +88,14 @@ struct GradientDescentParameters {
INPUTS:
See member declarations below for a description of each parameter.
\endrst*/
GradientDescentParameters(int num_multistarts_in, int max_num_steps_in, int max_num_restarts_in, double gamma_in, double pre_mult_in, double max_relative_change_in, double tolerance_in)
GradientDescentParameters(int num_multistarts_in, int max_num_steps_in,
int max_num_restarts_in, int num_steps_averaged_in,
double gamma_in, double pre_mult_in,
double max_relative_change_in, double tolerance_in)
: num_multistarts(num_multistarts_in),
max_num_steps(max_num_steps_in),
max_num_restarts(max_num_restarts_in),
num_steps_averaged(num_steps_averaged_in),
gamma(gamma_in),
pre_mult(pre_mult_in),
max_relative_change(max_relative_change_in),
@@ -94,6 +112,11 @@ struct GradientDescentParameters {
//! maximum number of gradient descent restarts, i.e., the number of times we are allowed to re-call gradient descent. Should be >= 2 as a minimum (suggest: 4-20)
int max_num_restarts;

// polyak-ruppert averaging control
//! number of steps to use in polyak-ruppert averaging (see above)
//! (suggest: 10-50% of max_num_steps for stochastic problems, 0 otherwise)
int num_steps_averaged;

// learning rate control
//! exponent controlling rate of step size decrease (see struct docs or GradientDescentOptimizer) (suggest: 0.5-0.9)
double gamma;
@@ -138,7 +161,9 @@ struct NewtonParameters {
INPUTS:
See member declarations below for a description of each parameter.
\endrst*/
NewtonParameters(int num_multistarts_in, int max_num_steps_in, double gamma_in, double time_factor_in, double max_relative_change_in, double tolerance_in)
NewtonParameters(int num_multistarts_in, int max_num_steps_in, double gamma_in,
double time_factor_in, double max_relative_change_in,
double tolerance_in)
: num_multistarts(num_multistarts_in),
max_num_steps(max_num_steps_in),
gamma(gamma_in),
29 changes: 25 additions & 4 deletions moe/optimal_learning/cpp/gpp_python_common.cpp
@@ -19,6 +19,7 @@

#include <vector> // NOLINT(build/include_order)

#include <boost/python/args.hpp> // NOLINT(build/include_order)
#include <boost/python/class.hpp> // NOLINT(build/include_order)
#include <boost/python/enum.hpp> // NOLINT(build/include_order)
#include <boost/python/extract.hpp> // NOLINT(build/include_order)
@@ -204,7 +205,8 @@ void ExportEnumTypes() {
}

void ExportOptimizerParameterStructs() {
boost::python::class_<GradientDescentParameters, boost::noncopyable>("GradientDescentParameters", boost::python::init<int, int, int, double, double, double, double>(R"%%(
boost::python::class_<GradientDescentParameters, boost::noncopyable>("GradientDescentParameters", boost::python::init<int, int, int, int, double, double, double, double>(
(boost::python::arg("num_multistarts"), "max_num_steps", "max_num_restarts", "num_steps_averaged", "gamma", "pre_mult", "max_relative_change", "tolerance"), R"%%(
Constructor for a GradientDescentParameters object.
:param num_multistarts: number of initial guesses to try in multistarted gradient descent (suggest: a few hundred)
@@ -213,6 +215,8 @@ void ExportOptimizerParameterStructs() {
:type max_num_steps: int > 0
:param max_num_restarts: maximum number of gradient descent restarts, i.e., the number of times we are allowed to re-call gradient descent. Should be >= 2 as a minimum (suggest: 4-20)
:type max_num_restarts: int > 0
:param num_steps_averaged: number of steps to use in polyak-ruppert averaging (see above) (suggest: 10-50% of max_num_steps for stochastic problems, 0 otherwise) (UNUSED)
:type num_steps_averaged: int (range is clamped as described above)
:param gamma: exponent controlling rate of step size decrease (see struct docs or GradientDescentOptimizer) (suggest: 0.5-0.9)
:type gamma: float64 > 0.0
:param pre_mult: scaling factor for step size (see struct docs or GradientDescentOptimizer) (suggest: 0.1-1.0)
@@ -223,9 +227,19 @@ void ExportOptimizerParameterStructs() {
:param tolerance: when the magnitude of the gradient falls below this value OR we will not move farther than tolerance
(e.g., at a boundary), stop. (suggest: 1.0e-7)
:type tolerance: float64 >= 0.0
)%%"));
)%%"))
.def_readwrite("num_multistarts", &GradientDescentParameters::num_multistarts, "number of initial guesses to try in multistarted gradient descent (suggest: a few hundred)")
.def_readwrite("max_num_steps", &GradientDescentParameters::max_num_steps, "maximum number of gradient descent iterations per restart (suggest: 200-1000)")
.def_readwrite("max_num_restarts", &GradientDescentParameters::max_num_restarts, "maximum number of gradient descent restarts, the we are allowed to call gradient descent. Should be >= 2 as a minimum (suggest: 4-20)")
.def_readwrite("num_steps_averaged", &GradientDescentParameters::num_steps_averaged, "number of steps to use in polyak-ruppert averaging (suggest: 10-50% of max_num_steps for stochastic problems, 0 otherwise)")
.def_readwrite("gamma", &GradientDescentParameters::gamma, "exponent controlling rate of step size decrease (see struct docs or GradientDescentOptimizer) (suggest: 0.5-0.9)")
.def_readwrite("pre_mult", &GradientDescentParameters::pre_mult, "scaling factor for step size (see struct docs or GradientDescentOptimizer) (suggest: 0.1-1.0)")
.def_readwrite("max_relative_change", &GradientDescentParameters::max_relative_change, "max change allowed per GD iteration (as a relative fraction of current distance to wall), see ctor docstring")
.def_readwrite("tolerance", &GradientDescentParameters::tolerance, "when the magnitude of the gradient falls below this value OR we will not move farther than tolerance")
; // NOLINT, this is boost style
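A hedged usage sketch of what this exposes on the Python side. The constructor keywords and attribute names come from the bindings above; the import path is an assumption about how the compiled boost::python module is typically loaded and may differ in your build.

```python
from moe.build import GPP as C_GP  # assumed import path for the compiled extension

# Keyword construction works because init<> now names its arguments.
gd_params = C_GP.GradientDescentParameters(
    num_multistarts=200, max_num_steps=500, max_num_restarts=10,
    num_steps_averaged=0, gamma=0.7, pre_mult=0.4,
    max_relative_change=1.0, tolerance=1.0e-7)

# Members exported via def_readwrite are plain attributes: read or tweak in place.
print(gd_params.max_num_steps)   # -> 500
gd_params.max_num_steps = 1000
gd_params.num_steps_averaged = 50
```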

boost::python::class_<NewtonParameters, boost::noncopyable>("NewtonParameters", boost::python::init<int, int, double, double, double, double>(R"%%(
boost::python::class_<NewtonParameters, boost::noncopyable>("NewtonParameters", boost::python::init<int, int, double, double, double, double>(
(boost::python::arg("num_multistarts"), "max_num_steps", "gamma", "time_factor", "max_relative_change", "tolerance"), R"%%(
Constructor for a NewtonParameters object.
:param num_multistarts: number of initial guesses to try in multistarted newton (suggest: a few hundred)
@@ -240,7 +254,14 @@ void ExportOptimizerParameterStructs() {
:type max_relative_change: float64 in [0, 1]
:param tolerance: when the magnitude of the gradient falls below this value, stop (suggest: 1.0e-10)
:type tolerance: float64 >= 0.0
)%%"));
)%%"))
.def_readwrite("num_multistarts", &NewtonParameters::num_multistarts, "number of initial guesses to try in multistarted gradient descent (suggest: a few hundred)")
.def_readwrite("max_num_steps", &NewtonParameters::max_num_steps, "maximum number of gradient descent iterations per restart (suggest: 200-1000)")
.def_readwrite("gamma", &NewtonParameters::gamma, "exponent controlling rate of time_factor growth (see class docs and NewtonOptimizer) (suggest: 1.01-1.1)")
.def_readwrite("time_factor", &NewtonParameters::time_factor, "initial amount of additive diagonal dominance (see class docs and NewtonOptimizer) (suggest: 1.0e-3-1.0e-1)")
.def_readwrite("max_relative_change", &NewtonParameters::max_relative_change, "max change allowed per update (as a relative fraction of current distance to wall) (Newton may ignore this) (suggest: 1.0)")
.def_readwrite("tolerance", &NewtonParameters::tolerance, "when the magnitude of the gradient falls below this value, stop (suggest: 1.0e-10)")
; // NOLINT, this is boost style
}

void ExportRandomnessContainer() {
@@ -313,7 +313,7 @@ void DispatchHeuristicExpectedImprovementOptimization(const boost::python::objec
int num_random_samples = boost::python::extract<int>(optimizer_parameters.attr("num_random_samples"));

bool random_search_only = true;
GradientDescentParameters gradient_descent_parameters(0, 0, 0, 1.0, 1.0, 1.0, 0.0); // dummy struct; we aren't using gradient descent
GradientDescentParameters gradient_descent_parameters(0, 0, 0, 0, 1.0, 1.0, 1.0, 0.0); // dummy struct; we aren't using gradient descent
ComputeHeuristicPointsToSample(gaussian_process, gradient_descent_parameters, domain,
estimation_policy, thread_schedule, best_so_far,
random_search_only, num_random_samples, num_to_sample,
4 changes: 2 additions & 2 deletions moe/optimal_learning/cpp/gpp_python_gaussian_process.cpp
@@ -328,9 +328,9 @@ void ExportGaussianProcessFunctions() {
Forces recomputation of all derived quantities for GP to remain consistent.
:param new_points: coordinates of each new point to add
:param new_points: list of float64 with shape (num_new_points, dim)
:type new_points: list of float64 with shape (num_new_points, dim)
:param new_points_value: function value at each new point
:param new_points_value: list of float64 with shape (num_new_points, )
:type new_points_value: list of float64 with shape (num_new_points, )
:param new_points_noise_variance: \sigma_n^2 corresponding to the signal noise in measuring new_points_value
:type new_points_noise_variance: list of float64 with shape (num_new_points, )
:param num_new_points: number of new points to add to the GP