optimizer param struct improvements #425

Merged
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,8 @@
* Added startup message to REST server including tips for OSX users (#400)
* Added GPU support to ``cpp_wrappers.expected_improvement.multistart_expected_improvement_optimization``; requires ``max_num_threads == 1`` until future multi-GPU support (#368)
* Added the COBYLA optimizer to the expected improvement optimization class. (#370)
* OptimizerParameter struct members now directly readable/writeable from Python; added EqualityComparisonMixin (#138)
* C++ GradientDescentParameters object now stores ``num_steps_averaged`` (but does not use it yet) (#391)

* Changes

4 changes: 3 additions & 1 deletion moe/optimal_learning/cpp/gpp_expected_improvement_demo.cpp
@@ -199,7 +199,9 @@ int main() {
int num_multistarts = 30; // max number of multistarted locations
int max_num_steps = 500; // maximum number of GD iterations per restart
int max_num_restarts = 20; // number of restarts to run with GD
GradientDescentParameters gd_params(num_multistarts, max_num_steps, max_num_restarts, gamma,
int num_steps_averaged = 0; // number of steps to use in polyak-ruppert averaging
GradientDescentParameters gd_params(num_multistarts, max_num_steps, max_num_restarts,
num_steps_averaged, gamma,
pre_mult, max_relative_change, tolerance);
// so the total number of GD iterations is at most:
// num_multistarts * max_num_restarts * max_num_steps
@@ -242,9 +242,10 @@ int CudaExpectedImprovementOptimizationMultipleSamplesTest() {
const double tolerance = 1.0e-5;
const int max_gradient_descent_steps = 100;
const int max_num_restarts = 3;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, gamma, pre_mult,
max_num_restarts, num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

// grid search parameters
@@ -403,8 +404,10 @@ int CudaExpectedImprovementOptimizationAnalyticTest() {
const double tolerance = 1.0e-7;
const int max_gradient_descent_steps = 1000;
const int max_num_restarts = 10;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps, max_num_restarts,
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, num_steps_averaged,
gamma, pre_mult, max_relative_change, tolerance);

// grid search parameters
@@ -306,9 +306,10 @@ int HeuristicExpectedImprovementOptimizationTestCore(EstimationPolicyTypes polic
const double tolerance = 1.0e-12;
const int max_gradient_descent_steps = 300;
const int max_num_restarts = 5;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, gamma, pre_mult,
max_num_restarts, num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

static const int kMaxNumThreads = 4;
4 changes: 3 additions & 1 deletion moe/optimal_learning/cpp/gpp_hyper_and_EI_demo.cpp
@@ -198,7 +198,9 @@ int main() {
int num_multistarts = 10; // max number of multistarted locations
int max_num_steps = 500; // maximum number of GD iterations per restart
int max_num_restarts = 20; // number of restarts to run with GD
GradientDescentParameters gd_params(num_multistarts, max_num_steps, max_num_restarts, gamma,
int num_steps_averaged = 0; // number of steps to use in polyak-ruppert averaging
GradientDescentParameters gd_params(num_multistarts, max_num_steps, max_num_restarts,
num_steps_averaged, gamma,
pre_mult, max_relative_change_ei, tolerance_ei);

// EI evaluation parameters
15 changes: 11 additions & 4 deletions moe/optimal_learning/cpp/gpp_math_test.cpp
@@ -1051,7 +1051,10 @@ int MultithreadedEIOptimizationTest(ExpectedImprovementEvaluationMode ei_mode) {

const int max_gradient_descent_steps = 250;
const int max_num_restarts = 3;
GradientDescentParameters gd_params(0, max_gradient_descent_steps, max_num_restarts, gamma, pre_mult, max_relative_change, tolerance);
const int num_steps_averaged = 0;
GradientDescentParameters gd_params(0, max_gradient_descent_steps, max_num_restarts,
num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

int max_mc_iterations = 967;

@@ -1259,8 +1262,10 @@ OL_WARN_UNUSED_RESULT int ExpectedImprovementOptimizationTestCore(ExpectedImprov
const double tolerance = 1.0e-7;
const int max_gradient_descent_steps = 1000;
const int max_num_restarts = 10;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps, max_num_restarts,
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, num_steps_averaged,
gamma, pre_mult, max_relative_change, tolerance);

// grid search parameters
@@ -1459,9 +1464,10 @@ OL_WARN_UNUSED_RESULT int ExpectedImprovementOptimizationSimplexTestCore(Expecte
const double tolerance = 1.0e-7;
const int max_gradient_descent_steps = 1000;
const int max_num_restarts = 10;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, gamma, pre_mult,
max_num_restarts, num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

// grid search parameters
@@ -1673,9 +1679,10 @@ int ExpectedImprovementOptimizationMultipleSamplesTest() {
const double tolerance = 1.0e-5;
const int max_gradient_descent_steps = 250;
const int max_num_restarts = 3;
const int num_steps_averaged = 0;
const int num_multistarts = 20;
GradientDescentParameters gd_params(num_multistarts, max_gradient_descent_steps,
max_num_restarts, gamma, pre_mult,
max_num_restarts, num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

// grid search parameters
3 changes: 2 additions & 1 deletion moe/optimal_learning/cpp/gpp_model_selection_test.cpp
@@ -366,7 +366,8 @@ OL_WARN_UNUSED_RESULT int HyperparameterLikelihoodOptimizationTestCore(LogLikeli
}
const int max_gradient_descent_steps = 600;
const int max_num_restarts = 5;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts, gamma, pre_mult, max_relative_change, tolerance);
const int num_steps_averaged = 0;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts, num_steps_averaged, gamma, pre_mult, max_relative_change, tolerance);

int total_errors = 0;
int current_errors = 0;
2 changes: 2 additions & 0 deletions moe/optimal_learning/cpp/gpp_optimization.hpp
@@ -551,6 +551,8 @@ struct OptimizationIOContainer final {
};

/*!\rst
TODO(GH-390): Implement Polyak-Ruppert Averaging for Gradient Descent

  Implements gradient descent to find a locally optimal (maximal here) value of the specified objective function.
Additional high-level discussion is provided in section 2a) in the header docs of this file.
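
  To orient readers while the full discussion is collapsed in this view, the sketch below is illustrative only: the function name, the ``optimal_learning`` namespace, and the toy 1-D objective are assumptions, not the library's actual control flow. It shows how the counters and learning-rate fields of ``GradientDescentParameters`` bound a restarted ascent loop of the kind these docs describe.

```cpp
#include <cmath>

#include "gpp_optimizer_parameters.hpp"

// Illustrative sketch only -- not the library's implementation.
// Runs restarted gradient ascent on the toy objective f(x) = -x^2, showing how
// max_num_restarts and max_num_steps bound the work, how pre_mult and gamma set
// the diminishing step size, and how tolerance triggers early termination.
double SketchRestartedGradientAscent(
    const optimal_learning::GradientDescentParameters& gd_params, double x_start) {
  double x = x_start;
  for (int restart = 0; restart < gd_params.max_num_restarts; ++restart) {
    for (int step = 0; step < gd_params.max_num_steps; ++step) {
      const double gradient = -2.0 * x;  // d/dx of f(x) = -x^2 (maximum at x = 0)
      if (std::fabs(gradient) < gd_params.tolerance) {
        return x;  // converged
      }
      const double alpha = gd_params.pre_mult * std::pow(step + 1.0, -gd_params.gamma);
      x += alpha * gradient;  // ascent step with diminishing step size
    }
  }
  return x;
}
```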

10 changes: 8 additions & 2 deletions moe/optimal_learning/cpp/gpp_optimization_test.cpp
@@ -214,7 +214,10 @@ OL_WARN_UNUSED_RESULT int MockObjectiveGradientDescentOptimizationTestCore() {
const double tolerance = 1.0e-12;
const int max_gradient_descent_steps = 1000;
const int max_num_restarts = 10;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts, gamma, pre_mult, max_relative_change, tolerance);
const int num_steps_averaged = 0;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts,
num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

int total_errors = 0;
int current_errors = 0;
@@ -341,7 +344,10 @@ OL_WARN_UNUSED_RESULT int MockObjectiveGradientDescentConstrainedOptimizationTes
const double tolerance = 1.0e-12;
const int max_gradient_descent_steps = 1000;
const int max_num_restarts = 10;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts, gamma, pre_mult, max_relative_change, tolerance);
const int num_steps_averaged = 0;
GradientDescentParameters gd_parameters(1, max_gradient_descent_steps, max_num_restarts,
num_steps_averaged, gamma, pre_mult,
max_relative_change, tolerance);

int total_errors = 0;
int current_errors = 0;
29 changes: 27 additions & 2 deletions moe/optimal_learning/cpp/gpp_optimizer_parameters.hpp
@@ -51,6 +51,20 @@ struct NullParameters {
The total number of gradient descent steps is at most ``num_multistarts * max_num_steps * max_num_restarts``
Generally, allowing more iterations leads to a better solution but costs more time.

  **Averaging (TODO(GH-390): NOT IMPLEMENTED YET)**

When optimizing stochastic objective functions, it can often be beneficial to average some number of gradient descent
steps to obtain the final result (vs just returning the last step).
Polyak-Ruppert averaging: postprocessing step where we replace ``x_n`` with:
  ``\bar{x} = \frac{1}{n - n_0} \sum_{t=n_0 + 1}^n x_t``
``n_0 = 0`` averages all steps; ``n_0 = n - 1`` is equivalent to returning ``x_n`` directly.
Here, num_steps_averaged is ``n - n_0``.

* ``num_steps_averaged`` < 0: averages all steps
* ``num_steps_averaged`` == 0: do not average
* ``num_steps_averaged`` > 0 and <= ``max_num_steps``: average the specified number of steps
  * ``num_steps_averaged`` > ``max_num_steps``: average all steps
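
  To make the averaging rule and clamping above concrete, here is a minimal illustrative C++ sketch (the function name and the vector-of-iterates representation are assumptions, and per GH-390 nothing like this exists in the library yet): average the last ``num_steps_averaged`` recorded iterates, clamping the count as the bullets describe.

```cpp
#include <vector>

// Illustrative only: Polyak-Ruppert averaging as a postprocessing step.
// iterates[t] holds x_{t+1}; there are n = iterates.size() recorded steps.
std::vector<double> PolyakRuppertAverage(const std::vector<std::vector<double>>& iterates,
                                         int num_steps_averaged) {
  if (iterates.empty()) return {};
  const int n = static_cast<int>(iterates.size());
  const int dim = static_cast<int>(iterates[0].size());
  // Clamp as documented: < 0 or > n averages everything; 0 means "last step only".
  if (num_steps_averaged < 0 || num_steps_averaged > n) num_steps_averaged = n;
  if (num_steps_averaged == 0) num_steps_averaged = 1;
  std::vector<double> average(dim, 0.0);
  for (int t = n - num_steps_averaged; t < n; ++t) {
    for (int d = 0; d < dim; ++d) {
      average[d] += iterates[t][d];
    }
  }
  for (int d = 0; d < dim; ++d) {
    average[d] /= static_cast<double>(num_steps_averaged);
  }
  return average;
}
```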

**Learning Rate**

GD may be implemented using a learning rate: ``pre_mult * (i+1)^{-\gamma}``, where i is the current iteration
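
  As a one-line illustration of that schedule (helper name assumed, not part of the library): the step size at 0-based iteration ``i`` is ``pre_mult * (i+1)^{-gamma}``, so larger ``gamma`` decays the steps faster and larger ``pre_mult`` scales them all up.

```cpp
#include <cmath>

// Illustrative helper: diminishing GD step size for 0-based iteration i.
inline double GradientDescentStepSize(int i, double gamma, double pre_mult) {
  return pre_mult * std::pow(static_cast<double>(i + 1), -gamma);
}
```
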
@@ -74,10 +88,14 @@ struct GradientDescentParameters {
INPUTS:
See member declarations below for a description of each parameter.
\endrst*/
GradientDescentParameters(int num_multistarts_in, int max_num_steps_in, int max_num_restarts_in, double gamma_in, double pre_mult_in, double max_relative_change_in, double tolerance_in)
GradientDescentParameters(int num_multistarts_in, int max_num_steps_in,
int max_num_restarts_in, int num_steps_averaged_in,
double gamma_in, double pre_mult_in,
double max_relative_change_in, double tolerance_in)
: num_multistarts(num_multistarts_in),
max_num_steps(max_num_steps_in),
max_num_restarts(max_num_restarts_in),
num_steps_averaged(num_steps_averaged_in),
gamma(gamma_in),
pre_mult(pre_mult_in),
max_relative_change(max_relative_change_in),
@@ -94,6 +112,11 @@ struct GradientDescentParameters {
//! maximum number of gradient descent restarts, i.e., the number of times we are allowed to call gradient descent. Should be >= 2 as a minimum (suggest: 4-20)
int max_num_restarts;

// polyak-ruppert averaging control
//! number of steps to use in polyak-ruppert averaging (see above)
//! (suggest: 10-50% of max_num_steps for stochastic problems, 0 otherwise)
int num_steps_averaged;

// learning rate control
//! exponent controlling rate of step size decrease (see struct docs or GradientDescentOptimizer) (suggest: 0.5-0.9)
double gamma;
@@ -138,7 +161,9 @@ struct NewtonParameters {
INPUTS:
See member declarations below for a description of each parameter.
\endrst*/
NewtonParameters(int num_multistarts_in, int max_num_steps_in, double gamma_in, double time_factor_in, double max_relative_change_in, double tolerance_in)
NewtonParameters(int num_multistarts_in, int max_num_steps_in, double gamma_in,
double time_factor_in, double max_relative_change_in,
double tolerance_in)
: num_multistarts(num_multistarts_in),
max_num_steps(max_num_steps_in),
gamma(gamma_in),
29 changes: 25 additions & 4 deletions moe/optimal_learning/cpp/gpp_python_common.cpp
@@ -19,6 +19,7 @@

#include <vector> // NOLINT(build/include_order)

#include <boost/python/args.hpp> // NOLINT(build/include_order)
#include <boost/python/class.hpp> // NOLINT(build/include_order)
#include <boost/python/enum.hpp> // NOLINT(build/include_order)
#include <boost/python/extract.hpp> // NOLINT(build/include_order)
@@ -204,7 +205,8 @@ void ExportEnumTypes() {
}

void ExportOptimizerParameterStructs() {
boost::python::class_<GradientDescentParameters, boost::noncopyable>("GradientDescentParameters", boost::python::init<int, int, int, double, double, double, double>(R"%%(
boost::python::class_<GradientDescentParameters, boost::noncopyable>("GradientDescentParameters", boost::python::init<int, int, int, int, double, double, double, double>(
(boost::python::arg("num_multistarts"), "max_num_steps", "max_num_restarts", "num_steps_averaged", "gamma", "pre_mult", "max_relative_change", "tolerance"), R"%%(
Constructor for a GradientDescentParameters object.

:param num_multistarts: number of initial guesses to try in multistarted gradient descent (suggest: a few hundred)
@@ -213,6 +215,8 @@ void ExportOptimizerParameterStructs() {
:type max_num_steps: int > 0
    :param max_num_restarts: maximum number of gradient descent restarts, i.e., the number of times we are allowed to call gradient descent. Should be >= 2 as a minimum (suggest: 4-20)
:type max_num_restarts: int > 0
:param num_steps_averaged: number of steps to use in polyak-ruppert averaging (see above) (suggest: 10-50% of max_num_steps for stochastic problems, 0 otherwise) (UNUSED)
:type num_steps_averaged: int (range is clamped as described above)
:param gamma: exponent controlling rate of step size decrease (see struct docs or GradientDescentOptimizer) (suggest: 0.5-0.9)
:type gamma: float64 > 1.0
:param pre_mult: scaling factor for step size (see struct docs or GradientDescentOptimizer) (suggest: 0.1-1.0)
@@ -223,9 +227,19 @@ void ExportOptimizerParameterStructs() {
:param tolerance: when the magnitude of the gradient falls below this value OR we will not move farther than tolerance
(e.g., at a boundary), stop. (suggest: 1.0e-7)
:type tolerance: float64 >= 0.0
)%%"));
)%%"))
.def_readwrite("num_multistarts", &GradientDescentParameters::num_multistarts, "number of initial guesses to try in multistarted gradient descent (suggest: a few hundred)")
.def_readwrite("max_num_steps", &GradientDescentParameters::max_num_steps, "maximum number of gradient descent iterations per restart (suggest: 200-1000)")
.def_readwrite("max_num_restarts", &GradientDescentParameters::max_num_restarts, "maximum number of gradient descent restarts, the we are allowed to call gradient descent. Should be >= 2 as a minimum (suggest: 4-20)")
.def_readwrite("num_steps_averaged", &GradientDescentParameters::num_steps_averaged, "number of steps to use in polyak-ruppert averaging (suggest: 10-50% of max_num_steps for stochastic problems, 0 otherwise)")
.def_readwrite("gamma", &GradientDescentParameters::gamma, "exponent controlling rate of step size decrease (see struct docs or GradientDescentOptimizer) (suggest: 0.5-0.9)")
.def_readwrite("pre_mult", &GradientDescentParameters::pre_mult, "scaling factor for step size (see struct docs or GradientDescentOptimizer) (suggest: 0.1-1.0)")
.def_readwrite("max_relative_change", &GradientDescentParameters::max_relative_change, "max change allowed per GD iteration (as a relative fraction of current distance to wall), see ctor docstring")
.def_readwrite("tolerance", &GradientDescentParameters::tolerance, "when the magnitude of the gradient falls below this value OR we will not move farther than tolerance")
; // NOLINT, this is boost style
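
  For context on what the new ``def_readwrite`` lines expose: the fields are plain public members of the C++ struct, so the Python-side read/write access mirrors ordinary member access in C++. A small sketch follows (the include path and ``optimal_learning`` namespace are assumed from the repo layout; the numeric values are arbitrary placeholders, not recommendations).

```cpp
#include "gpp_optimizer_parameters.hpp"

// Illustrative values only; see the docstrings above for suggested ranges.
void ExampleTweakGradientDescentParameters() {
  optimal_learning::GradientDescentParameters gd_params(
      20,       // num_multistarts
      500,      // max_num_steps
      4,        // max_num_restarts
      0,        // num_steps_averaged (stored but unused until GH-390 lands)
      0.7,      // gamma
      1.0,      // pre_mult
      1.0,      // max_relative_change
      1.0e-7);  // tolerance
  // Members are directly readable/writeable, which is the same access pattern
  // the def_readwrite bindings above now expose to Python.
  gd_params.num_steps_averaged = gd_params.max_num_steps / 4;
  gd_params.tolerance = 1.0e-5;
}
```

  In Python the same pattern becomes attribute access on the bound class, e.g. reading or assigning ``gradient_descent_parameters.tolerance`` directly.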

boost::python::class_<NewtonParameters, boost::noncopyable>("NewtonParameters", boost::python::init<int, int, double, double, double, double>(R"%%(
boost::python::class_<NewtonParameters, boost::noncopyable>("NewtonParameters", boost::python::init<int, int, double, double, double, double>(
(boost::python::arg("num_multistarts"), "max_num_steps", "gamma", "time_factor", "max_relative_change", "tolerance"), R"%%(
Constructor for a NewtonParameters object.

:param num_multistarts: number of initial guesses to try in multistarted newton (suggest: a few hundred)
@@ -240,7 +254,14 @@ void ExportOptimizerParameterStructs() {
:type max_relative_change: float64 in [0, 1]
:param tolerance: when the magnitude of the gradient falls below this value, stop (suggest: 1.0e-10)
:type tolerance: float64 >= 0.0
)%%"));
)%%"))
.def_readwrite("num_multistarts", &NewtonParameters::num_multistarts, "number of initial guesses to try in multistarted gradient descent (suggest: a few hundred)")
.def_readwrite("max_num_steps", &NewtonParameters::max_num_steps, "maximum number of gradient descent iterations per restart (suggest: 200-1000)")
.def_readwrite("gamma", &NewtonParameters::gamma, "exponent controlling rate of time_factor growth (see class docs and NewtonOptimizer) (suggest: 1.01-1.1)")
.def_readwrite("time_factor", &NewtonParameters::time_factor, "initial amount of additive diagonal dominance (see class docs and NewtonOptimizer) (suggest: 1.0e-3-1.0e-1)")
.def_readwrite("max_relative_change", &NewtonParameters::max_relative_change, "max change allowed per update (as a relative fraction of current distance to wall) (Newton may ignore this) (suggest: 1.0)")
.def_readwrite("tolerance", &NewtonParameters::tolerance, "when the magnitude of the gradient falls below this value, stop (suggest: 1.0e-10)")
; // NOLINT, this is boost style
}

void ExportRandomnessContainer() {
@@ -313,7 +313,7 @@ void DispatchHeuristicExpectedImprovementOptimization(const boost::python::objec
int num_random_samples = boost::python::extract<int>(optimizer_parameters.attr("num_random_samples"));

bool random_search_only = true;
GradientDescentParameters gradient_descent_parameters(0, 0, 0, 1.0, 1.0, 1.0, 0.0); // dummy struct; we aren't using gradient descent
GradientDescentParameters gradient_descent_parameters(0, 0, 0, 0, 1.0, 1.0, 1.0, 0.0); // dummy struct; we aren't using gradient descent
ComputeHeuristicPointsToSample(gaussian_process, gradient_descent_parameters, domain,
estimation_policy, thread_schedule, best_so_far,
random_search_only, num_random_samples, num_to_sample,
4 changes: 2 additions & 2 deletions moe/optimal_learning/cpp/gpp_python_gaussian_process.cpp
@@ -328,9 +328,9 @@ void ExportGaussianProcessFunctions() {
Forces recomputation of all derived quantities for GP to remain consistent.

:param new_points: coordinates of each new point to add
:param new_points: list of float64 with shape (num_new_points, dim)
:type new_points: list of float64 with shape (num_new_points, dim)
:param new_points_value: function value at each new point
:param new_points_value: list of float64 with shape (num_new_points, )
:type new_points_value: list of float64 with shape (num_new_points, )
:param new_points_noise_variance: \sigma_n^2 corresponding to the signal noise in measuring new_points_value
:type new_points_noise_variance: list of float64 with shape (num_new_points, )
:param num_new_points: number of new points to add to the GP