Merge pull request #425 from Yelp/eliu_gh_138_expose_cpp_struct_data_members_to_python

optimizer param struct improvements
suntzu86 committed Dec 1, 2014
2 parents 53318ab + 44f645b commit 83141c7
Showing 20 changed files with 449 additions and 87 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,8 @@
* Added startup message to REST server including tips for OSX users (#400)
* Added GPU support to ``cpp_wrappers.expected_improvement.multistart_expected_improvement_optimization``; requires ``max_num_threads == 1`` until future multi-GPU support (#368)
* Added the COBYLA optimizer to the expected improvement optimization class. (#370)
* OptimizerParameter struct members now directly readable/writeable from Python; added EqualityComparisonMixin (#138)
* C++ GradientDescentParameters object now stores ``num_steps_averaged`` (but does not use it yet) (#391)

* Changes

4 changes: 3 additions & 1 deletion moe/optimal_learning/cpp/gpp_expected_improvement_demo.cpp
@@ -199,7 +199,9 @@ int main() {
int num_multistarts = 30; // max number of multistarted locations
int max_num_steps = 500; // maximum number of GD iterations per restart
int max_num_restarts = 20; // number of restarts to run with GD
GradientDescentParameters gd_params(num_multistarts, max_num_steps, max_num_restarts, gamma,
int num_steps_averaged = 0; // number of steps to use in polyak-ruppert averaging
GradientDescentParameters gd_params(num_multistarts, max_num_steps, max_num_restarts,
num_steps_averaged, gamma,
pre_mult, max_relative_change, tolerance);
// so the total number of GD iterations is at most:
// num_multistarts * max_num_restarts * max_num_steps
@@ -242,9 +242,10 @@ int CudaExpectedImprovementOptimizationMultipleSamplesTest() {
const double tolerance = 1.0e-5;
const int max_gradient_descent_steps = 100;
const int max_num_restarts = 3;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, gamma, pre_mult,
max_num_restarts, num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

// grid search parameters
@@ -403,8 +404,10 @@ int CudaExpectedImprovementOptimizationAnalyticTest() {
const double tolerance = 1.0e-7;
const int max_gradient_descent_steps = 1000;
const int max_num_restarts = 10;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps, max_num_restarts,
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, num_steps_averaged,
gamma, pre_mult, max_relative_change, tolerance);

// grid search parameters
@@ -306,9 +306,10 @@ int HeuristicExpectedImprovementOptimizationTestCore(EstimationPolicyTypes polic
const double tolerance = 1.0e-12;
const int max_gradient_descent_steps = 300;
const int max_num_restarts = 5;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, gamma, pre_mult,
max_num_restarts, num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

static const int kMaxNumThreads = 4;
4 changes: 3 additions & 1 deletion moe/optimal_learning/cpp/gpp_hyper_and_EI_demo.cpp
@@ -198,7 +198,9 @@ int main() {
int num_multistarts = 10; // max number of multistarted locations
int max_num_steps = 500; // maximum number of GD iterations per restart
int max_num_restarts = 20; // number of restarts to run with GD
GradientDescentParameters gd_params(num_multistarts, max_num_steps, max_num_restarts, gamma,
int num_steps_averaged = 0; // number of steps to use in polyak-ruppert averaging
GradientDescentParameters gd_params(num_multistarts, max_num_steps, max_num_restarts,
num_steps_averaged, gamma,
pre_mult, max_relative_change_ei, tolerance_ei);

// EI evaluation parameters
15 changes: 11 additions & 4 deletions moe/optimal_learning/cpp/gpp_math_test.cpp
@@ -1051,7 +1051,10 @@ int MultithreadedEIOptimizationTest(ExpectedImprovementEvaluationMode ei_mode) {

const int max_gradient_descent_steps = 250;
const int max_num_restarts = 3;
GradientDescentParameters gd_params(0, max_gradient_descent_steps, max_num_restarts, gamma, pre_mult, max_relative_change, tolerance);
const int num_steps_averaged = 0;
GradientDescentParameters gd_params(0, max_gradient_descent_steps, max_num_restarts,
num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

int max_mc_iterations = 967;

@@ -1259,8 +1262,10 @@ OL_WARN_UNUSED_RESULT int ExpectedImprovementOptimizationTestCore(ExpectedImprov
const double tolerance = 1.0e-7;
const int max_gradient_descent_steps = 1000;
const int max_num_restarts = 10;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps, max_num_restarts,
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, num_steps_averaged,
gamma, pre_mult, max_relative_change, tolerance);

// grid search parameters
@@ -1459,9 +1464,10 @@ OL_WARN_UNUSED_RESULT int ExpectedImprovementOptimizationSimplexTestCore(Expecte
const double tolerance = 1.0e-7;
const int max_gradient_descent_steps = 1000;
const int max_num_restarts = 10;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, gamma, pre_mult,
max_num_restarts, num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

// grid search parameters
@@ -1673,9 +1679,10 @@ int ExpectedImprovementOptimizationMultipleSamplesTest() {
const double tolerance = 1.0e-5;
const int max_gradient_descent_steps = 250;
const int max_num_restarts = 3;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, gamma, pre_mult,
max_num_restarts, num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

// grid search parameters
3 changes: 2 additions & 1 deletion moe/optimal_learning/cpp/gpp_model_selection_test.cpp
@@ -366,7 +366,8 @@ OL_WARN_UNUSED_RESULT int HyperparameterLikelihoodOptimizationTestCore(LogLikeli
}
const int max_gradient_descent_steps = 600;
const int max_num_restarts = 5;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts, gamma, pre_mult, max_relative_change, tolerance);
const int num_steps_averaged = 0;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts, num_steps_averaged, gamma, pre_mult, max_relative_change, tolerance);

int total_errors = 0;
int current_errors = 0;
2 changes: 2 additions & 0 deletions moe/optimal_learning/cpp/gpp_optimization.hpp
@@ -551,6 +551,8 @@ struct OptimizationIOContainer final {
};

/*!\rst
TODO(GH-390): Implement Polyak-Ruppert Averaging for Gradient Descent
Implements gradient descent to find a locally optimal (maximal here) value of the specified objective function.
Additional high-level discussion is provided in section 2a) in the header docs of this file.
10 changes: 8 additions & 2 deletions moe/optimal_learning/cpp/gpp_optimization_test.cpp
@@ -214,7 +214,10 @@ OL_WARN_UNUSED_RESULT int MockObjectiveGradientDescentOptimizationTestCore() {
const double tolerance = 1.0e-12;
const int max_gradient_descent_steps = 1000;
const int max_num_restarts = 10;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts, gamma, pre_mult, max_relative_change, tolerance);
const int num_steps_averaged = 0;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts,
num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

int total_errors = 0;
int current_errors = 0;
@@ -341,7 +344,10 @@ OL_WARN_UNUSED_RESULT int MockObjectiveGradientDescentConstrainedOptimizationTes
const double tolerance = 1.0e-12;
const int max_gradient_descent_steps = 1000;
const int max_num_restarts = 10;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts, gamma, pre_mult, max_relative_change, tolerance);
const int num_steps_averaged = 0;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts,
num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

int total_errors = 0;
int current_errors = 0;
29 changes: 27 additions & 2 deletions moe/optimal_learning/cpp/gpp_optimizer_parameters.hpp
@@ -51,6 +51,20 @@ struct NullParameters {
The total number of gradient descent steps is at most ``num_multistarts * max_num_steps * max_num_restarts``
Generally, allowing more iterations leads to a better solution but costs more time.
**Averaging (TODO(GH-390): NOT IMPLEMENTED YET)**
When optimizing stochastic objective functions, it can often be beneficial to average some number of gradient descent
steps to obtain the final result (vs just returning the last step).
Polyak-Ruppert averaging: postprocessing step where we replace ``x_n`` with:
``\bar{x} = \frac{1}{n - n_0} \sum_{t=n_0 + 1}^n x_t``
``n_0 = 0`` averages all steps; ``n_0 = n - 1`` is equivalent to returning ``x_n`` directly.
Here, num_steps_averaged is ``n - n_0``.
* ``num_steps_averaged`` < 0: averages all steps
* ``num_steps_averaged`` == 0: do not average
* ``num_steps_averaged`` > 0 and <= ``max_num_steps``: average the specified number of steps
* ``num_steps_averaged`` > ``max_num_steps``: average all steps
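To make the clamping rules above concrete, here is a minimal, illustrative Python sketch of the intended post-processing. The helper is hypothetical and not part of this commit; per GH-390 the real averaging is slated for the C++ gradient descent loop.

```python
def polyak_ruppert_average(iterates, num_steps_averaged):
    """Average the trailing gradient descent iterates, per the clamping rules above.

    Illustrative sketch only. ``iterates`` is the list [x_1, ..., x_n] of points
    visited by gradient descent, each a list of coordinates.
    """
    n = len(iterates)
    if num_steps_averaged < 0 or num_steps_averaged > n:
        num_steps_averaged = n            # average all steps
    if num_steps_averaged == 0:
        return iterates[-1]               # no averaging: return x_n directly
    # n_0 = n - num_steps_averaged, so we average x_{n_0 + 1}, ..., x_n.
    tail = iterates[-num_steps_averaged:]
    return [sum(coord) / num_steps_averaged for coord in zip(*tail)]
```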
**Learning Rate**
GD may be implemented using a learning rate: ``pre_mult * (i+1)^{-\gamma}``, where i is the current iteration
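As a quick illustration of that schedule (values picked from the suggested ranges in these docs, purely for demonstration):

```python
# Step-size schedule used by gradient descent: pre_mult * (i + 1)^{-gamma}.
# Larger gamma shrinks steps faster; larger pre_mult takes bigger initial steps.
gamma, pre_mult = 0.7, 0.4   # illustrative values from the suggested ranges
step_sizes = [pre_mult * (i + 1) ** (-gamma) for i in range(5)]
# -> roughly [0.400, 0.246, 0.185, 0.152, 0.130]
```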
@@ -74,10 +88,14 @@ struct GradientDescentParameters {
INPUTS:
See member declarations below for a description of each parameter.
\endrst*/
GradientDescentParameters(int num_multistarts_in, int max_num_steps_in, int max_num_restarts_in, double gamma_in, double pre_mult_in, double max_relative_change_in, double tolerance_in)
GradientDescentParameters(int num_multistarts_in, int max_num_steps_in,
int max_num_restarts_in, int num_steps_averaged_in,
double gamma_in, double pre_mult_in,
double max_relative_change_in, double tolerance_in)
: num_multistarts(num_multistarts_in),
max_num_steps(max_num_steps_in),
max_num_restarts(max_num_restarts_in),
num_steps_averaged(num_steps_averaged_in),
gamma(gamma_in),
pre_mult(pre_mult_in),
max_relative_change(max_relative_change_in),
@@ -94,6 +112,11 @@ struct GradientDescentParameters {
//! maximum number of gradient descent restarts, i.e., the number of times we are allowed to re-call gradient descent. Should be >= 2 as a minimum (suggest: 4-20)
int max_num_restarts;

// polyak-ruppert averaging control
//! number of steps to use in polyak-ruppert averaging (see above)
//! (suggest: 10-50% of max_num_steps for stochastic problems, 0 otherwise)
int num_steps_averaged;

// learning rate control
//! exponent controlling rate of step size decrease (see struct docs or GradientDescentOptimizer) (suggest: 0.5-0.9)
double gamma;
@@ -138,7 +161,9 @@ struct NewtonParameters {
INPUTS:
See member declarations below for a description of each parameter.
\endrst*/
NewtonParameters(int num_multistarts_in, int max_num_steps_in, double gamma_in, double time_factor_in, double max_relative_change_in, double tolerance_in)
NewtonParameters(int num_multistarts_in, int max_num_steps_in, double gamma_in,
double time_factor_in, double max_relative_change_in,
double tolerance_in)
: num_multistarts(num_multistarts_in),
max_num_steps(max_num_steps_in),
gamma(gamma_in),
29 changes: 25 additions & 4 deletions moe/optimal_learning/cpp/gpp_python_common.cpp
@@ -19,6 +19,7 @@

#include <vector> // NOLINT(build/include_order)

#include <boost/python/args.hpp> // NOLINT(build/include_order)
#include <boost/python/class.hpp> // NOLINT(build/include_order)
#include <boost/python/enum.hpp> // NOLINT(build/include_order)
#include <boost/python/extract.hpp> // NOLINT(build/include_order)
@@ -204,7 +205,8 @@ void ExportEnumTypes() {
}

void ExportOptimizerParameterStructs() {
boost::python::class_<GradientDescentParameters, boost::noncopyable>("GradientDescentParameters", boost::python::init<int, int, int, double, double, double, double>(R"%%(
boost::python::class_<GradientDescentParameters, boost::noncopyable>("GradientDescentParameters", boost::python::init<int, int, int, int, double, double, double, double>(
(boost::python::arg("num_multistarts"), "max_num_steps", "max_num_restarts", "num_steps_averaged", "gamma", "pre_mult", "max_relative_change", "tolerance"), R"%%(
Constructor for a GradientDescentParameters object.
:param num_multistarts: number of initial guesses to try in multistarted gradient descent (suggest: a few hundred)
@@ -213,6 +215,8 @@ void ExportOptimizerParameterStructs() {
:type max_num_steps: int > 0
:param max_num_restarts: maximum number of gradient descent restarts, i.e., the number of times we are allowed to re-call gradient descent. Should be >= 2 as a minimum (suggest: 4-20)
:type max_num_restarts: int > 0
:param num_steps_averaged: number of steps to use in polyak-ruppert averaging (see above) (suggest: 10-50% of max_num_steps for stochastic problems, 0 otherwise) (UNUSED)
:type num_steps_averaged: int (range is clamped as described above)
:param gamma: exponent controlling rate of step size decrease (see struct docs or GradientDescentOptimizer) (suggest: 0.5-0.9)
:type gamma: float64 > 0.0
:param pre_mult: scaling factor for step size (see struct docs or GradientDescentOptimizer) (suggest: 0.1-1.0)
@@ -223,9 +227,19 @@ void ExportOptimizerParameterStructs() {
:param tolerance: when the magnitude of the gradient falls below this value OR we will not move farther than tolerance
(e.g., at a boundary), stop. (suggest: 1.0e-7)
:type tolerance: float64 >= 0.0
)%%"));
)%%"))
.def_readwrite("num_multistarts", &GradientDescentParameters::num_multistarts, "number of initial guesses to try in multistarted gradient descent (suggest: a few hundred)")
.def_readwrite("max_num_steps", &GradientDescentParameters::max_num_steps, "maximum number of gradient descent iterations per restart (suggest: 200-1000)")
.def_readwrite("max_num_restarts", &GradientDescentParameters::max_num_restarts, "maximum number of gradient descent restarts, the we are allowed to call gradient descent. Should be >= 2 as a minimum (suggest: 4-20)")
.def_readwrite("num_steps_averaged", &GradientDescentParameters::num_steps_averaged, "number of steps to use in polyak-ruppert averaging (suggest: 10-50% of max_num_steps for stochastic problems, 0 otherwise)")
.def_readwrite("gamma", &GradientDescentParameters::gamma, "exponent controlling rate of step size decrease (see struct docs or GradientDescentOptimizer) (suggest: 0.5-0.9)")
.def_readwrite("pre_mult", &GradientDescentParameters::pre_mult, "scaling factor for step size (see struct docs or GradientDescentOptimizer) (suggest: 0.1-1.0)")
.def_readwrite("max_relative_change", &GradientDescentParameters::max_relative_change, "max change allowed per GD iteration (as a relative fraction of current distance to wall), see ctor docstring")
.def_readwrite("tolerance", &GradientDescentParameters::tolerance, "when the magnitude of the gradient falls below this value OR we will not move farther than tolerance")
; // NOLINT, this is boost style
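A hedged usage sketch of what this exposes on the Python side. The constructor keywords and attribute names come from the bindings above; the import path is an assumption about how the compiled boost::python module is typically loaded and may differ in your build.

```python
from moe.build import GPP as C_GP  # assumed import path for the compiled extension

# Keyword construction works because init<> now names its arguments.
gd_params = C_GP.GradientDescentParameters(
    num_multistarts=200, max_num_steps=500, max_num_restarts=10,
    num_steps_averaged=0, gamma=0.7, pre_mult=0.4,
    max_relative_change=1.0, tolerance=1.0e-7)

# Members exported via def_readwrite are plain attributes: read or tweak in place.
print(gd_params.max_num_steps)   # -> 500
gd_params.max_num_steps = 1000
gd_params.num_steps_averaged = 50
```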

boost::python::class_<NewtonParameters, boost::noncopyable>("NewtonParameters", boost::python::init<int, int, double, double, double, double>(R"%%(
boost::python::class_<NewtonParameters, boost::noncopyable>("NewtonParameters", boost::python::init<int, int, double, double, double, double>(
(boost::python::arg("num_multistarts"), "max_num_steps", "gamma", "time_factor", "max_relative_change", "tolerance"), R"%%(
Constructor for a NewtonParameters object.
:param num_multistarts: number of initial guesses to try in multistarted newton (suggest: a few hundred)
@@ -240,7 +254,14 @@ void ExportOptimizerParameterStructs() {
:type max_relative_change: float64 in [0, 1]
:param tolerance: when the magnitude of the gradient falls below this value, stop (suggest: 1.0e-10)
:type tolerance: float64 >= 0.0
)%%"));
)%%"))
.def_readwrite("num_multistarts", &NewtonParameters::num_multistarts, "number of initial guesses to try in multistarted gradient descent (suggest: a few hundred)")
.def_readwrite("max_num_steps", &NewtonParameters::max_num_steps, "maximum number of gradient descent iterations per restart (suggest: 200-1000)")
.def_readwrite("gamma", &NewtonParameters::gamma, "exponent controlling rate of time_factor growth (see class docs and NewtonOptimizer) (suggest: 1.01-1.1)")
.def_readwrite("time_factor", &NewtonParameters::time_factor, "initial amount of additive diagonal dominance (see class docs and NewtonOptimizer) (suggest: 1.0e-3-1.0e-1)")
.def_readwrite("max_relative_change", &NewtonParameters::max_relative_change, "max change allowed per update (as a relative fraction of current distance to wall) (Newton may ignore this) (suggest: 1.0)")
.def_readwrite("tolerance", &NewtonParameters::tolerance, "when the magnitude of the gradient falls below this value, stop (suggest: 1.0e-10)")
; // NOLINT, this is boost style
}

void ExportRandomnessContainer() {
@@ -313,7 +313,7 @@ void DispatchHeuristicExpectedImprovementOptimization(const boost::python::objec
int num_random_samples = boost::python::extract<int>(optimizer_parameters.attr("num_random_samples"));

bool random_search_only = true;
GradientDescentParameters gradient_descent_parameters(0, 0, 0, 1.0, 1.0, 1.0, 0.0); // dummy struct; we aren't using gradient descent
GradientDescentParameters gradient_descent_parameters(0, 0, 0, 0, 1.0, 1.0, 1.0, 0.0); // dummy struct; we aren't using gradient descent
ComputeHeuristicPointsToSample(gaussian_process, gradient_descent_parameters, domain,
estimation_policy, thread_schedule, best_so_far,
random_search_only, num_random_samples, num_to_sample,
4 changes: 2 additions & 2 deletions moe/optimal_learning/cpp/gpp_python_gaussian_process.cpp
@@ -328,9 +328,9 @@ void ExportGaussianProcessFunctions() {
Forces recomputation of all derived quantities for GP to remain consistent.
:param new_points: coordinates of each new point to add
:param new_points: list of float64 with shape (num_new_points, dim)
:type new_points: list of float64 with shape (num_new_points, dim)
:param new_points_value: function value at each new point
:param new_points_value: list of float64 with shape (num_new_points, )
:type new_points_value: list of float64 with shape (num_new_points, )
:param new_points_noise_variance: \sigma_n^2 corresponding to the signal noise in measuring new_points_value
:type new_points_noise_variance: list of float64 with shape (num_new_points, )
:param num_new_points: number of new points to add to the GP