From e45dcff27eaee9f072c9f9538dd16e27afe1ef27 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Fri, 14 Aug 2020 18:15:06 +0800 Subject: [PATCH 01/30] add doc; notest --- python/paddle/fluid/optimizer.py | 170 ++++++++++++++++++++----------- 1 file changed, 112 insertions(+), 58 deletions(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 740db0d4b9e35..9706c8f2f41d4 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -188,7 +188,7 @@ def set_dict(self, state_dict): para_state_dict, opti_state_dict = fluid.load_dygraph( "paddle_dy") - adam.set_dict(opti_state_dict) + adam.load_state_dict(opti_state_dict) ''' @@ -1141,7 +1141,7 @@ def _append_optimize_op(self, block, param_and_grad): class DGCMomentumOptimizer(Optimizer): """ - :api_attr: Static Graph + :api_attr: Static Graph DGC (Deep Gradient Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887 @@ -1804,29 +1804,22 @@ class AdamOptimizer(Optimizer): learning\_rate & = learning\_rate * \\ \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t} - param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + param\_out & = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} Related paper: `Adam: A Method for Stochastic Optimization `_ Args: - learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``. - It can be a float value or a ``Variable`` with a float type. The default value is 0.001. - beta1 (float|Variable, optional): The exponential decay rate for the 1st moment estimates. - It should be a float number or a Variable with shape [1] and data type as float32. - The default value is 0.9. - beta2 (float|Variable, optional): The exponential decay rate for the 2nd moment estimates. - It should be a float number or a Variable with shape [1] and data type as float32. - The default value is 0.999. - epsilon (float, optional): A small float value for numerical stability. + lr (float|Tensor, optional): The learning rate used to update ``Parameter``. + It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. + betas (list[float]|list[Tensor], optional): The list of exponential decay rate for moment estimates. + The elements of list should be float numbers or Tensor with shape [1] and data type as float32. + The default value is [0.9, 0.999]. + eps (float, optional): A small float value for numerical stability. The default value is 1e-08. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ + params (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. + weight_decay (float, optional): Weight decay of L2 regularization. The default value is 0. 
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of some derived class of ``GradientClipBase`` . There are three cliping strategies ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , @@ -1857,7 +1850,7 @@ class AdamOptimizer(Optimizer): cost = fluid.layers.square_error_cost(input=y_predict, label=y) avg_cost = fluid.layers.mean(cost) - adam_optimizer = fluid.optimizer.AdamOptimizer(0.01) + adam_optimizer = fluid.optimizer.Adam(0.01) adam_optimizer.minimize(avg_cost) fetch_list = [avg_cost] @@ -1871,7 +1864,7 @@ class AdamOptimizer(Optimizer): .. code-block:: python - # Adam with beta1/beta2 as Variable + # Adam with betas as list[Tensor] import paddle import paddle.fluid as fluid import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler @@ -1885,7 +1878,7 @@ class AdamOptimizer(Optimizer): cost = fluid.layers.square_error_cost(input=y_predict, label=y) avg_cost = fluid.layers.mean(cost) - # define beta decay variable + # define beta decay Tensor def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate): global_step = lr_scheduler._decay_step_counter() @@ -1910,13 +1903,12 @@ def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate): fluid.layers.assign(decayed_beta1, beta1) fluid.layers.assign(decayed_beta2, beta2) - return beta1, beta2 + return [beta1, beta2] - beta1, beta2 = get_decayed_betas(0.9, 0.99, 1e5, 0.9) - adam_optimizer = fluid.optimizer.AdamOptimizer( - learning_rate=0.01, - beta1=beta1, - beta2=beta2) + betas = get_decayed_betas(0.9, 0.99, 1e5, 0.9) + adam_optimizer = fluid.optimizer.Adam( + lr=0.01, + betas=betas) adam_optimizer.minimize(avg_cost) fetch_list = [avg_cost] @@ -2077,22 +2069,17 @@ class AdamaxOptimizer(Optimizer): it is added here for numerical stability to prevent the division by 0 error. Args: - learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``. - It can be a float value or a ``Variable`` with a float type. The default value is 0.001. - beta1 (float, optional): The exponential decay rate for the 1st moment estimates. - The default value is 0.9. - beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. - The default value is 0.999. + lr (float|Tensor, optional): The learning rate used to update ``Parameter``. + It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. + betas (list[float]|list[Tensor], optional): The list of exponential decay rate for moment estimates. + The elements of list should be float numbers or Tensor with shape [1] and data type as float32. + The default value is [0.9, 0.999]. epsilon (float, optional): A small float value for numerical stability. The default value is 1e-08. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ + parms (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. 
\ - Default None, meaning there is no regularization. + weight_decay (float, optional): Weight decay of L2 regularization. The default value is 0. grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of some derived class of ``GradientClipBase`` . There are three cliping strategies ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , @@ -2102,7 +2089,7 @@ class AdamaxOptimizer(Optimizer): The default value is None. **Notes**: - **Currently, AdamaxOptimizer doesn't support sparse parameter optimization.** + **Currently, Adamax doesn't support sparse parameter optimization.** Examples: .. code-block:: python @@ -2120,7 +2107,7 @@ class AdamaxOptimizer(Optimizer): data = fluid.data(name='X', shape=[None, 1], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) loss = fluid.layers.mean(hidden) - adam = fluid.optimizer.AdamaxOptimizer(learning_rate=0.2) + adam = fluid.optimizer.Adamax(lr=0.2) adam.minimize(loss) # Run the startup program once and only once. @@ -2221,6 +2208,77 @@ def _finish_update(self, block, parameters_and_grads): attrs={"scale": self._beta1}, stop_gradient=True) +class AdmaW(optimizer): + """ + The AdamaW optimizer is implemented based on the AdamaW Optimization + in paper `DECOUPLED WEIGHT DECAY REGULARIZATION `_. + it can resolves the problem of L2 regularization failure in the Adam optimizer. + + .. math:: + + t & = t + 1 + + moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad + + moemnt\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad + + learning\_rate & = learning\_rate * \ + \frac{\sqrt{1 - {\\beta}_2^t}}{1 - {beta}_1^t} + + param\_out & = param - learning\_rate * (\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) + + + Args: + lr (float|Tensor, optional): The learning rate used to update ``Parameter``. + It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. + betas (list[float]|list[Tensor], optional): The list of exponential decay rate for moment estimates. + The elements of list should be float numbers or Tensor with shape [1] and data type as float32. + The default value is [0.9, 0.999]. + epsilon (float, optional): A small float value for numerical stability. + The default value is 1e-08. + parms (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float, optional): Weight decay of L2 regularization. The default value is 0. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + + **Notes**: + **Currently, Adamax doesn't support sparse parameter optimization.** + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy + + # First create the Executor. 
+ place = fluid.CPUPlace() # fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + train_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + data = fluid.data(name='X', shape=[None, 1], dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + loss = fluid.layers.mean(hidden) + adam = fluid.optimizer.AdamW(lr=0.2) + adam.minimize(loss) + + # Run the startup program once and only once. + exe.run(startup_program) + + x = numpy.random.random(size=(10, 1)).astype('float32') + outs = exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) + """ + class DpsgdOptimizer(Optimizer): """ @@ -2595,9 +2653,9 @@ class RMSPropOptimizer(Optimizer): Parameters: - learning_rate(float): Global learning rate. - rho(float): rho is :math: `\\rho` in equation, default is 0.95. - epsilon(float): :math: `\\epsilon` in equation is smoothing term to + lr(float): Global learning rate. + alpha(float): rho is :math: `\\rho` in equation, default is 0.95. + eps(float): :math: `\\epsilon` in equation is smoothing term to avoid division by zero, default is 1e-6. momentum(float): :math:`\\beta` in equation is the momentum term, default is 0.0. @@ -2605,14 +2663,10 @@ class RMSPropOptimizer(Optimizer): the gradient; if False, by the uncentered second moment. Setting this to True may help with training, but is slightly more expensive in terms of computation and memory. Defaults to False. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ + params (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. + weight_decay (float, optional): Weight decay of L2 regularization. The default value is 0. grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of some derived class of ``GradientClipBase`` . There are three cliping strategies ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , @@ -2621,7 +2675,7 @@ class RMSPropOptimizer(Optimizer): For details, please refer to :ref:`api_guide_Name`. Default is None. Raises: - ValueError: If learning_rate, rho, epsilon, momentum are None. + ValueError: If lr, rho, epsilon, momentum are None. Examples: .. code-block:: python @@ -2639,7 +2693,7 @@ class RMSPropOptimizer(Optimizer): cost = fluid.layers.square_error_cost(input=y_predict, label=y) avg_cost = fluid.layers.mean(cost) - rms_optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) + rms_optimizer = fluid.optimizer.RMSProp(lr=0.1) rms_optimizer.minimize(avg_cost) fetch_list = [avg_cost] @@ -3067,7 +3121,7 @@ def _append_optimize_op(self, block, param_and_grad): class ModelAverage(Optimizer): """ - :api_attr: Static Graph + :api_attr: Static Graph The ModelAverage optimizer accumulates specific continuous historical parameters during training. 
The accumulated historical range can be controlled by the passed @@ -3376,7 +3430,7 @@ def restore(self, executor): class ExponentialMovingAverage(object): """ - :api_attr: Static Graph + :api_attr: Static Graph Compute the moving average of parameters with exponential decay. Given a parameter :math:`\\theta`, its exponential moving average (EMA) @@ -3626,7 +3680,7 @@ def restore(self, executor): class PipelineOptimizer(object): """ - :api_attr: Static Graph + :api_attr: Static Graph Pipeline Optimizer: Make a program to run as pipeline, that is splitting a program into multiple sections (sub-programs) and each section run on a @@ -4477,7 +4531,7 @@ def minimize(self, class RecomputeOptimizer(Optimizer): """ - :api_attr: Static Graph + :api_attr: Static Graph Recompute Optimizer Wrapper @@ -4562,7 +4616,7 @@ def _set_checkpoints(self, checkpoints): def load(self, stat_dict): """ - :api_attr: Static Graph + :api_attr: Static Graph load function is not supported by Recompute Optimizer for now. :return: None @@ -4786,7 +4840,7 @@ def minimize(self, class LookaheadOptimizer(object): """ - :api_attr: Static Graph + :api_attr: Static Graph This implements the Lookahead optimizer of the paper : https://arxiv.org/abs/1907.08610. From 85b3f92d2d72352fed75d2627390baf48fd35541 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Fri, 14 Aug 2020 18:28:16 +0800 Subject: [PATCH 02/30] fix doc; notest --- python/paddle/fluid/optimizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 9706c8f2f41d4..f205d10965a75 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -2198,7 +2198,7 @@ def _finish_update(self, block, parameters_and_grads): if grad is None or param.trainable is False: continue with param.block.program._optimized_guard( - [param, grad]), name_scope('adamx'): + [param, grad]), name_scope('adamax'): beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, param) block.append_op( @@ -2208,9 +2208,9 @@ def _finish_update(self, block, parameters_and_grads): attrs={"scale": self._beta1}, stop_gradient=True) -class AdmaW(optimizer): +class AdamW(optimizer): """ - The AdamaW optimizer is implemented based on the AdamaW Optimization + The AdamW optimizer is implemented based on the AdamW Optimization in paper `DECOUPLED WEIGHT DECAY REGULARIZATION `_. it can resolves the problem of L2 regularization failure in the Adam optimizer. @@ -2249,7 +2249,7 @@ class AdmaW(optimizer): The default value is None. **Notes**: - **Currently, Adamax doesn't support sparse parameter optimization.** + **Currently, AdamW doesn't support sparse parameter optimization.** Examples: .. 
code-block:: python From cbcd9506912829422372ee58328530a3267532b6 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Fri, 14 Aug 2020 18:51:48 +0800 Subject: [PATCH 03/30] update doc; notest --- python/paddle/fluid/optimizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index f205d10965a75..8e3fe1fc8dd7f 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -1804,7 +1804,7 @@ class AdamOptimizer(Optimizer): learning\_rate & = learning\_rate * \\ \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t} - param\_out & = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} Related paper: `Adam: A Method for Stochastic Optimization `_ @@ -2222,10 +2222,10 @@ class AdamW(optimizer): moemnt\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad - learning\_rate & = learning\_rate * \ - \frac{\sqrt{1 - {\\beta}_2^t}}{1 - {beta}_1^t} + learning\_rate & = learning\_rate * \\ + \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {beta}_1^t} - param\_out & = param - learning\_rate * (\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) + param\_out & = param - learning\_rate * (\\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) Args: From 9661a540871106a90a0f1faabf31272aae1df66a Mon Sep 17 00:00:00 2001 From: MRXLT Date: Fri, 14 Aug 2020 18:51:48 +0800 Subject: [PATCH 04/30] refine optimizer && adam --- python/paddle/fluid/optimizer.py | 172 +- .../fluid/tests/unittests/test_adam_op.py | 34 + python/paddle/optimizer/__init__.py | 7 +- python/paddle/optimizer/optimizer.py | 5248 +++++++++++++++++ 4 files changed, 5344 insertions(+), 117 deletions(-) create mode 100644 python/paddle/optimizer/optimizer.py diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index f205d10965a75..740db0d4b9e35 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -188,7 +188,7 @@ def set_dict(self, state_dict): para_state_dict, opti_state_dict = fluid.load_dygraph( "paddle_dy") - adam.load_state_dict(opti_state_dict) + adam.set_dict(opti_state_dict) ''' @@ -1141,7 +1141,7 @@ def _append_optimize_op(self, block, param_and_grad): class DGCMomentumOptimizer(Optimizer): """ - :api_attr: Static Graph + :api_attr: Static Graph DGC (Deep Gradient Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887 @@ -1804,22 +1804,29 @@ class AdamOptimizer(Optimizer): learning\_rate & = learning\_rate * \\ \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t} - param\_out & = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} Related paper: `Adam: A Method for Stochastic Optimization `_ Args: - lr (float|Tensor, optional): The learning rate used to update ``Parameter``. - It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. - betas (list[float]|list[Tensor], optional): The list of exponential decay rate for moment estimates. - The elements of list should be float numbers or Tensor with shape [1] and data type as float32. - The default value is [0.9, 0.999]. - eps (float, optional): A small float value for numerical stability. + learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``. 
+ It can be a float value or a ``Variable`` with a float type. The default value is 0.001. + beta1 (float|Variable, optional): The exponential decay rate for the 1st moment estimates. + It should be a float number or a Variable with shape [1] and data type as float32. + The default value is 0.9. + beta2 (float|Variable, optional): The exponential decay rate for the 2nd moment estimates. + It should be a float number or a Variable with shape [1] and data type as float32. + The default value is 0.999. + epsilon (float, optional): A small float value for numerical stability. The default value is 1e-08. - params (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ + parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. - weight_decay (float, optional): Weight decay of L2 regularization. The default value is 0. + regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ + :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ + regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ + ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of some derived class of ``GradientClipBase`` . There are three cliping strategies ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , @@ -1850,7 +1857,7 @@ class AdamOptimizer(Optimizer): cost = fluid.layers.square_error_cost(input=y_predict, label=y) avg_cost = fluid.layers.mean(cost) - adam_optimizer = fluid.optimizer.Adam(0.01) + adam_optimizer = fluid.optimizer.AdamOptimizer(0.01) adam_optimizer.minimize(avg_cost) fetch_list = [avg_cost] @@ -1864,7 +1871,7 @@ class AdamOptimizer(Optimizer): .. code-block:: python - # Adam with betas as list[Tensor] + # Adam with beta1/beta2 as Variable import paddle import paddle.fluid as fluid import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler @@ -1878,7 +1885,7 @@ class AdamOptimizer(Optimizer): cost = fluid.layers.square_error_cost(input=y_predict, label=y) avg_cost = fluid.layers.mean(cost) - # define beta decay Tensor + # define beta decay variable def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate): global_step = lr_scheduler._decay_step_counter() @@ -1903,12 +1910,13 @@ def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate): fluid.layers.assign(decayed_beta1, beta1) fluid.layers.assign(decayed_beta2, beta2) - return [beta1, beta2] + return beta1, beta2 - betas = get_decayed_betas(0.9, 0.99, 1e5, 0.9) - adam_optimizer = fluid.optimizer.Adam( - lr=0.01, - betas=betas) + beta1, beta2 = get_decayed_betas(0.9, 0.99, 1e5, 0.9) + adam_optimizer = fluid.optimizer.AdamOptimizer( + learning_rate=0.01, + beta1=beta1, + beta2=beta2) adam_optimizer.minimize(avg_cost) fetch_list = [avg_cost] @@ -2069,17 +2077,22 @@ class AdamaxOptimizer(Optimizer): it is added here for numerical stability to prevent the division by 0 error. Args: - lr (float|Tensor, optional): The learning rate used to update ``Parameter``. - It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. 
- betas (list[float]|list[Tensor], optional): The list of exponential decay rate for moment estimates. - The elements of list should be float numbers or Tensor with shape [1] and data type as float32. - The default value is [0.9, 0.999]. + learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``. + It can be a float value or a ``Variable`` with a float type. The default value is 0.001. + beta1 (float, optional): The exponential decay rate for the 1st moment estimates. + The default value is 0.9. + beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. + The default value is 0.999. epsilon (float, optional): A small float value for numerical stability. The default value is 1e-08. - parms (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ + parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. - weight_decay (float, optional): Weight decay of L2 regularization. The default value is 0. + regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ + :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ + regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ + ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of some derived class of ``GradientClipBase`` . There are three cliping strategies ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , @@ -2089,7 +2102,7 @@ class AdamaxOptimizer(Optimizer): The default value is None. **Notes**: - **Currently, Adamax doesn't support sparse parameter optimization.** + **Currently, AdamaxOptimizer doesn't support sparse parameter optimization.** Examples: .. code-block:: python @@ -2107,7 +2120,7 @@ class AdamaxOptimizer(Optimizer): data = fluid.data(name='X', shape=[None, 1], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) loss = fluid.layers.mean(hidden) - adam = fluid.optimizer.Adamax(lr=0.2) + adam = fluid.optimizer.AdamaxOptimizer(learning_rate=0.2) adam.minimize(loss) # Run the startup program once and only once. @@ -2198,7 +2211,7 @@ def _finish_update(self, block, parameters_and_grads): if grad is None or param.trainable is False: continue with param.block.program._optimized_guard( - [param, grad]), name_scope('adamax'): + [param, grad]), name_scope('adamx'): beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, param) block.append_op( @@ -2208,77 +2221,6 @@ def _finish_update(self, block, parameters_and_grads): attrs={"scale": self._beta1}, stop_gradient=True) -class AdamW(optimizer): - """ - The AdamW optimizer is implemented based on the AdamW Optimization - in paper `DECOUPLED WEIGHT DECAY REGULARIZATION `_. - it can resolves the problem of L2 regularization failure in the Adam optimizer. - - .. 
math:: - - t & = t + 1 - - moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad - - moemnt\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad - - learning\_rate & = learning\_rate * \ - \frac{\sqrt{1 - {\\beta}_2^t}}{1 - {beta}_1^t} - - param\_out & = param - learning\_rate * (\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) - - - Args: - lr (float|Tensor, optional): The learning rate used to update ``Parameter``. - It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. - betas (list[float]|list[Tensor], optional): The list of exponential decay rate for moment estimates. - The elements of list should be float numbers or Tensor with shape [1] and data type as float32. - The default value is [0.9, 0.999]. - epsilon (float, optional): A small float value for numerical stability. - The default value is 1e-08. - parms (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - weight_decay (float, optional): Weight decay of L2 regularization. The default value is 0. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. - The default value is None. - - **Notes**: - **Currently, AdamW doesn't support sparse parameter optimization.** - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy - - # First create the Executor. - place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - data = fluid.data(name='X', shape=[None, 1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) - adam = fluid.optimizer.AdamW(lr=0.2) - adam.minimize(loss) - - # Run the startup program once and only once. - exe.run(startup_program) - - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - """ - class DpsgdOptimizer(Optimizer): """ @@ -2653,9 +2595,9 @@ class RMSPropOptimizer(Optimizer): Parameters: - lr(float): Global learning rate. - alpha(float): rho is :math: `\\rho` in equation, default is 0.95. - eps(float): :math: `\\epsilon` in equation is smoothing term to + learning_rate(float): Global learning rate. + rho(float): rho is :math: `\\rho` in equation, default is 0.95. + epsilon(float): :math: `\\epsilon` in equation is smoothing term to avoid division by zero, default is 1e-6. momentum(float): :math:`\\beta` in equation is the momentum term, default is 0.0. @@ -2663,10 +2605,14 @@ class RMSPropOptimizer(Optimizer): the gradient; if False, by the uncentered second moment. Setting this to True may help with training, but is slightly more expensive in terms of computation and memory. Defaults to False. - params (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. 
\ + parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. - weight_decay (float, optional): Weight decay of L2 regularization. The default value is 0. + regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ + :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ + regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ + ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of some derived class of ``GradientClipBase`` . There are three cliping strategies ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , @@ -2675,7 +2621,7 @@ class RMSPropOptimizer(Optimizer): For details, please refer to :ref:`api_guide_Name`. Default is None. Raises: - ValueError: If lr, rho, epsilon, momentum are None. + ValueError: If learning_rate, rho, epsilon, momentum are None. Examples: .. code-block:: python @@ -2693,7 +2639,7 @@ class RMSPropOptimizer(Optimizer): cost = fluid.layers.square_error_cost(input=y_predict, label=y) avg_cost = fluid.layers.mean(cost) - rms_optimizer = fluid.optimizer.RMSProp(lr=0.1) + rms_optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) rms_optimizer.minimize(avg_cost) fetch_list = [avg_cost] @@ -3121,7 +3067,7 @@ def _append_optimize_op(self, block, param_and_grad): class ModelAverage(Optimizer): """ - :api_attr: Static Graph + :api_attr: Static Graph The ModelAverage optimizer accumulates specific continuous historical parameters during training. The accumulated historical range can be controlled by the passed @@ -3430,7 +3376,7 @@ def restore(self, executor): class ExponentialMovingAverage(object): """ - :api_attr: Static Graph + :api_attr: Static Graph Compute the moving average of parameters with exponential decay. Given a parameter :math:`\\theta`, its exponential moving average (EMA) @@ -3680,7 +3626,7 @@ def restore(self, executor): class PipelineOptimizer(object): """ - :api_attr: Static Graph + :api_attr: Static Graph Pipeline Optimizer: Make a program to run as pipeline, that is splitting a program into multiple sections (sub-programs) and each section run on a @@ -4531,7 +4477,7 @@ def minimize(self, class RecomputeOptimizer(Optimizer): """ - :api_attr: Static Graph + :api_attr: Static Graph Recompute Optimizer Wrapper @@ -4616,7 +4562,7 @@ def _set_checkpoints(self, checkpoints): def load(self, stat_dict): """ - :api_attr: Static Graph + :api_attr: Static Graph load function is not supported by Recompute Optimizer for now. :return: None @@ -4840,7 +4786,7 @@ def minimize(self, class LookaheadOptimizer(object): """ - :api_attr: Static Graph + :api_attr: Static Graph This implements the Lookahead optimizer of the paper : https://arxiv.org/abs/1907.08610. 
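For reference, the decoupled weight decay step described by the AdamW docstring added in this series can be written as a minimal NumPy sketch. This is only an illustration of the update formula from the docstring math; the `adamw_step` helper and its argument names are hypothetical and do not correspond to the actual `adam` operator implementation:

    import numpy as np

    def adamw_step(param, grad, m, v, t, lr=0.001, betas=(0.9, 0.999),
                   eps=1e-8, weight_decay=0.01):
        # One AdamW update: moment estimates, bias-corrected learning rate,
        # then the decoupled weight decay term (lambda * param) added to the step.
        beta1, beta2 = betas
        t += 1
        m = beta1 * m + (1 - beta1) * grad                  # first moment estimate
        v = beta2 * v + (1 - beta2) * grad * grad           # second moment estimate
        lr_t = lr * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
        param = param - lr_t * (m / (np.sqrt(v) + eps) + weight_decay * param)
        return param, m, v, t
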
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 7a7099b7113c8..3a35006e8cbc6 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -20,6 +20,7 @@ from paddle.fluid import core from paddle.fluid.op import Operator import paddle.fluid as fluid +import paddle class TestAdamOp1(OpTest): @@ -443,5 +444,38 @@ def test_with_place(place, shape): test_with_place(place, shape) +class TestAdamOpBetasV2(unittest.TestCase): + def test_adam_op(self): + exe = fluid.Executor(place, shape) + train_prog = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(train_prog, startup): + with fluid.unique_name.guard(): + data = fluid.data(name="data", shape=shape) + conv = fluid.layers.conv2d(data, 8, 3) + loss = fluid.layers.reduce_mean(conv) + + beta1 = fluid.layers.create_global_var( + shape=[1], value=0.85, dtype='float32', persistable=True) + beta2 = fluid.layers.create_global_var( + shape=[1], value=0.95, dtype='float32', persistable=True) + betas = [beta1, beta2] + opt = paddle.optimizer.Adam( + lr=1e-5, betas=betas, weight_decay=0.01) + opt.minimize(loss) + + exe.run(startup) + data_np = np.random.random(shape).astype('float32') + rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss]) + assert rets[0] is not None + + shape = [2, 3, 8, 8] + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for place in places: + test_with_place(place, shape) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index 4dc3cf397aea5..26b6c20af1176 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -14,7 +14,7 @@ __all__ = [ 'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam', - 'Adamax', 'AdamaxOptimizer', 'AdamOptimizer', 'DecayedAdagrad', + 'Adamax', 'DecayedAdagrad', 'AdamW' 'DecayedAdagradOptimizer', 'DGCMomentumOptimizer', 'Dpsgd', 'DpsgdOptimizer', 'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer', 'LambOptimizer', 'LarsMomentum', 'LarsMomentumOptimizer', @@ -24,9 +24,8 @@ ] -from ..fluid.optimizer import SGD, Momentum, Adagrad, Adam, Adamax, Dpsgd, DecayedAdagrad, \ - Ftrl, SGDOptimizer, MomentumOptimizer, AdagradOptimizer, \ - AdamOptimizer, AdamaxOptimizer, DpsgdOptimizer, \ +from .optimizer import SGD, Momentum, Adagrad, Adam, Adamax, AdamW, Dpsgd, DecayedAdagrad, \ + Ftrl, SGDOptimizer, MomentumOptimizer, AdagradOptimizer, DpsgdOptimizer, \ DecayedAdagradOptimizer, RMSPropOptimizer, FtrlOptimizer, Adadelta, \ AdadeltaOptimizer, ModelAverage, LarsMomentum, \ LarsMomentumOptimizer, DGCMomentumOptimizer, LambOptimizer, \ diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py new file mode 100644 index 0000000000000..0f7f9ccba38e0 --- /dev/null +++ b/python/paddle/optimizer/optimizer.py @@ -0,0 +1,5248 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import six +import logging +from collections import defaultdict + +from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table +from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard + +from ..fluid import framework +from ..fluid import layers +from ..fluid import unique_name +from ..fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name +from ..fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops +from ..fluid.framework import program_guard +from ..fluid.initializer import Constant +from ..fluid.layer_helper import LayerHelper +from ..fluid.layers import ops +from ..fluid.regularizer import append_regularization_ops +from ..fluid.dygraph import base as imperative_base +from ..fluid.dygraph import no_grad +from ..fluid.dygraph.learning_rate_scheduler import LearningRateDecay, _LearningRateEpochDecay +from paddle.fluid import core +from paddle.fluid.layers import tensor +from paddle.fluid.regularizer import L2Decay +from functools import reduce +from ..fluid.wrapped_decorator import signature_safe_contextmanager +from .. import compat as cpt + +__all__ = [ + 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad', + 'Ftrl', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', + 'AdamaxOptimizer', 'DpsgdOptimizer', 'DecayedAdagradOptimizer', + 'RMSPropOptimizer', 'FtrlOptimizer', 'Adadelta', 'AdadeltaOptimizer', + 'ModelAverage', 'LarsMomentum', 'LarsMomentumOptimizer', 'LambOptimizer', + 'ExponentialMovingAverage', 'PipelineOptimizer', 'LookaheadOptimizer', + 'RecomputeOptimizer' +] + + +class Optimizer(object): + """Optimizer Base class. + + Define the common interface of an optimizer. + User should not use this class directly, + but need to use one of it's implementation. + """ + + @imperative_base.no_grad() + def __init__(self, + learning_rate, + parameter_list=None, + regularization=None, + grad_clip=None, + name=None): + self._parameter_list = list( + parameter_list) if parameter_list is not None else None + self._name = name + if framework.in_dygraph_mode(): + if not isinstance(learning_rate, float) and \ + not isinstance(learning_rate, LearningRateDecay): + raise TypeError( + "learning rate should be float or LearningRateDecay, got %s here" + % type(learning_rate)) + if self._parameter_list is None: + raise AttributeError( + "parameter_list argument given to the Optimizer should not be None in dygraph mode." + ) + if regularization is not None: + for param in self._parameter_list: + if param.regularizer is not None: + logging.info( + "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. " + "The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" 
+ % regularization.__str__()) + break + else: + if not isinstance(learning_rate, float) and \ + not isinstance(learning_rate, framework.Variable): + raise TypeError( + "learning rate should be float or Variable, got %s here" % + type(learning_rate)) + + if grad_clip is not None: + if not isinstance(grad_clip, GradientClipBase): + raise TypeError( + "'grad_clip' should be an instance of GradientClipBase's derived class" + ) + self.regularization = regularization + self._grad_clip = grad_clip + self._learning_rate = learning_rate + # the learning rate type should be inferenced from loss + self._dtype = None + # each program should have a independent learning rate + # program -> Variable(learning_rate) + self._learning_rate_map = dict() + if isinstance(self._learning_rate, framework.Variable): + self._learning_rate_map[framework.default_main_program( + )] = self._learning_rate + # Dictionary of accumulators. Some optimizer subclasses need to + # allocate and manage extra variables associated with the parameters + # to train. These variables are called accumulators. + # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} + self._accumulators = defaultdict(lambda: dict()) + self.helper = None + self._opti_name_list = [] + self._accumulators_holder = {} + self._param_device_map = dict() + + @framework.dygraph_only + def state_dict(self): + ''' + Get state dict information from optimizer. It contain all the variable used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be include in state dict. + If the optimizer never be called(minimize function), the state_dict is empty. + + Args: None + Return: + state_dict(dict) : dict contains all the variable used by optimizer + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + + with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) + + adam = paddle.optimizer.Adam(0.001, params=emb.parameters()) + state_dict = adam.state_dict() + + ''' + state_dict = {} + for k, v in self._accumulators.items(): + for para_name, var_tmp in v.items(): + state_dict[var_tmp.name] = var_tmp + # global step if use lr decay + if isinstance(self._learning_rate, LearningRateDecay): + state_dict["LR_Scheduler"] = self._learning_rate.state_dict() + + if not isinstance(self._learning_rate, _LearningRateEpochDecay): + var_tmp = None + var_temp = framework._varbase_creator( + None, name='global_step', dtype='int32') + + tensor.fill_constant( + [1], "int32", self._learning_rate.step_num, out=var_temp) + + state_dict['global_step'] = var_temp + return state_dict + + @framework.dygraph_only + def load_state_dict(self, state_dict): + ''' + Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed. + + Args: + state_dict(dict) : Dict contains all the Variable needed by optimizer + Return: + None + + Examples: + .. 
code-block:: python + + with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) + + state_dict = emb.state_dict() + fluid.save_dygraph(state_dict, "paddle_dy") + + adam = paddle.optimizer.Adam(lr=fluid.layers.noam_decay( 100, 10000), + params=emb.parameters()) + state_dict = adam.state_dict() + fluid.save_dygraph(state_dict, "paddle_dy") + + para_state_dict, opti_state_dict = fluid.load_dygraph( "paddle_dy") + + adam.load_state_dict(opti_state_dict) + + ''' + + if isinstance(self._learning_rate, LearningRateDecay): + self._learning_rate.set_dict(state_dict["LR_Scheduler"]) + + if not isinstance(self._learning_rate, _LearningRateEpochDecay): + assert 'global_step' in state_dict, \ + 'Global step not in state dict, Dygraph use LearningRateDecay, global_step must in state_dict' + global_step = state_dict['global_step'] + + if isinstance(global_step, Variable): + step_np = global_step + step_np = np.array(step_np.value().get_tensor()) + assert step_np.shape == (1,), \ + "global step shape is (1,), the shape is {}".format( step_np.shape ) + + self._learning_rate.step_num = int(step_np[0]) + elif isinstance(global_step, np.ndarray): + assert global_step.shape == (1,), \ + "global step shape is (1,), the shape is {}".format( global_step.shape ) + self._learning_rate.step_num = global_step[0] + else: + raise RuntimeError( + "Type not supprt, value in state dict must be [VarBase, Variable, numpy], the type is ", + type(global_step)) + + self._accumulators_holder = state_dict + for k, v in self._accumulators.items(): + for para_name, var_tmp in v.items(): + assert var_tmp.name in state_dict, \ + "optimizer variable {} not found".format( var_tmp.name ) + var = var_tmp.value() + tensor = var.get_tensor() + model_np = np.array(tensor) + + load_para = state_dict[var_tmp.name] + + if isinstance(load_para, Variable): + load_para_np = load_para.numpy() + elif isinstance(load_para, core.VarBase): + load_para_np = load_para.numpy() + elif isinstance(load_para, np.ndarray): + load_para_np = load_para + else: + raise RuntimeError("State dict type {} not supprt".format( + str(type(load_para)))) + + assert model_np.shape == load_para_np.shape, \ + "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format( + item.name, model_np.shape, load_para_np.shape) + + assert model_np.dtype == load_para_np.dtype, \ + "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( + item.name, model_np.dtype, load_para_np.dtype) + + tensor.set(load_para_np, framework._current_expected_place()) + + def get_opti_var_name_list(self): + return self._opti_name_list + + def _create_global_learning_rate(self): + if imperative_base.enabled(): + # create learning rate Variable + if isinstance(self._learning_rate, float): + lr = self._global_learning_rate() + + if isinstance(lr, framework.Variable): + return + else: + self._learning_rate_map[framework.default_main_program( + )] = layers.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(self._learning_rate), + dtype='float32' if self._dtype is None else self._dtype, + persistable=True) + # get learning rate Variable from LearningRateDecay + elif isinstance(self._learning_rate, LearningRateDecay): + self._learning_rate_map[framework.default_main_program( + )] = self._learning_rate() + else: + raise TypeError( + "optimizer's learning rate must be float or LearningRateDecay" + ) + else: + lr = self._global_learning_rate() 
+ + if isinstance(lr, framework.Variable): + return + else: + if not isinstance(self._learning_rate, float): + raise TypeError( + "learning rate variable is create outside optimizer," + "can not create new learning rate variable for new program" + ) + + # create learning rate in the current main program + self._learning_rate_map[framework.default_main_program( + )] = layers.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(self._learning_rate), + dtype='float32' if self._dtype is None else self._dtype, + persistable=True) + + @framework.dygraph_only + def set_lr(self, value): + """ + :api_attr: imperative + + Set the value of the learning rate manually in the optimizer. If the optimizer use LearningRateDecay, + this API cannot be invoked, because it will lead to conflict. + + Args: + value (float|Variable): the value of learning rate + + Returns: + None + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + + with fluid.dygraph.guard(): + linear = fluid.dygraph.nn.Linear(10, 10) + + adam = paddle.optimizer.Adam(0.1, params=linear.parameters()) + + # set learning rate manually by python float value + lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] + for i in range(5): + adam.set_lr(lr_list[i]) + lr = adam.current_step_lr() + print("current lr is {}".format(lr)) + # Print: + # current lr is 0.2 + # current lr is 0.3 + # current lr is 0.4 + # current lr is 0.5 + # current lr is 0.6 + + + # set learning rate manually by framework Variable + lr_var = fluid.layers.create_global_var( + shape=[1], value=0.7, dtype='float32') + adam.set_lr(lr_var) + lr = adam.current_step_lr() + print("current lr is {}".format(lr)) + # Print: + # current lr is 0.7 + + + + """ + if not isinstance(value, (framework.Variable, float)): + raise TypeError( + "The type of 'value' in optimizer.set_lr must be (float, Variable), but received %s." + % (type(value))) + if isinstance(self._learning_rate, LearningRateDecay): + raise RuntimeError( + "optimizer's learning rate can't be LearningRateDecay when invoke this API, because this will lead to conflict." + ) + if isinstance(value, float): + self._learning_rate = value + current_lr = self._global_learning_rate() + if current_lr is not None: + global_block = framework.default_main_program().global_block() + global_block.append_op( + type='fill_constant', + outputs={'Out': [current_lr]}, + attrs={ + 'dtype': current_lr.dtype, + 'shape': list(current_lr.shape), + 'value': float(value) + }, + stop_gradient=True) + else: + assert len(value.shape) == 1 and value.shape[ + 0] == 1, "optimizer's learning rate must be 1-D Tensor with shape[1]" + self._learning_rate_map[framework.default_main_program()] = value + + @framework.dygraph_only + def current_step_lr(self): + """ + :api_attr: imperative + + Get current step learning rate. The return value is all the same When LearningRateDecay is not used, + otherwise return the step learning rate. + + Returns: + float: The learning rate of the current step. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + import paddle + import numpy as np + + # example1: LearningRateDecay is not used, return value is all the same + with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) + adam = paddle.optimizer.Adam(0.001, params = emb.parameters()) + lr = adam.current_step_lr() + print(lr) # 0.001 + + # example2: PiecewiseDecay is used, return the step learning rate + with fluid.dygraph.guard(): + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = fluid.dygraph.nn.Linear(10, 10) + inp = fluid.dygraph.to_variable(inp) + out = linear(inp) + loss = fluid.layers.reduce_mean(out) + + bd = [2, 4, 6, 8] + value = [0.2, 0.4, 0.6, 0.8, 1.0] + adam = paddle.optimizer.Adam(fluid.dygraph.PiecewiseDecay(bd, value, 0), + params=linear.parameters()) + + # first step: learning rate is 0.2 + np.allclose(adam.current_step_lr(), 0.2, rtol=1e-06, atol=0.0) # True + + # learning rate for different steps + ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0] + for i in range(12): + adam.minimize(loss) + lr = adam.current_step_lr() + np.allclose(lr, ret[i], rtol=1e-06, atol=0.0) # True + + """ + current_lr = self._global_learning_rate() + if isinstance(current_lr, framework.Variable): + return self._global_learning_rate().numpy()[0] + + if isinstance(self._learning_rate, float): + return self._learning_rate + elif isinstance(self._learning_rate, _LearningRateEpochDecay): + step_lr = self._learning_rate() + return step_lr.numpy()[0] + else: + step_lr = self._learning_rate.step() + if isinstance(step_lr, (float, int)): + return step_lr + else: + return step_lr.numpy()[0] + + def _global_learning_rate(self, program=None): + """ + get global decayed learning rate + :return: + """ + if program is None: + program = framework.default_main_program() + return self._learning_rate_map.get(program, None) + + def _append_optimize_op(self, block, param_and_grad): + """ append optimize operator to block and return all the added optimize_op + """ + raise NotImplementedError() + + def _create_param_lr(self, param_and_grad): + # create learning rate variable for every parameter + param = param_and_grad[0] + param_lr = param.optimize_attr['learning_rate'] + if type(param_lr) == Variable: + return param_lr + else: + if param_lr == 1.0: + return self._global_learning_rate() + else: + with default_main_program()._lr_schedule_guard( + is_with_opt=True), framework.name_scope( + 'scale_with_param_lr'): + return self._global_learning_rate() * param_lr + + def _create_accumulators(self, block, parameters): + """Create all accumulators needed by the parameters + + Args: + block: the block in which the loss variable is present + parameters: list of parameter variables for the optimizer + """ + pass + + def _finish_update(self, block, parameters_and_grads): + """Finish any custom updates needed + before completing an optimization step + + Args: + block: the block in which the loss variable is present + parameters: list of parameter variables for the optimizer + + Returns: + None + """ + pass + + def _add_accumulator(self, + name, + param, + dtype=None, + fill_value=0.0, + shape=None, + type=None, + device=None): + """Utility function to add an accumulator for a parameter + + Args: + block: the block in which the loss variable is present + name: name of the accumulator + param: parameter variable for which accumulator is to be added + dtype: data type of the accumulator variable + fill_value: value to initialize the accumulator variable + """ + if 
self._name is not None: + name = self._name + "_" + name + if (name in self._accumulators and + param.name in self._accumulators[name]): + if framework.in_dygraph_mode(): + return self._accumulators[name][param.name] + raise Exception("Accumulator {} already exists for parameter {}". + format(name, param.name)) + if shape == None: + shape = param.shape + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + "_" + name + var_name = unique_name.generate(var_name) + self._opti_name_list.append(var_name) + + var = self.helper.create_global_variable( + name=var_name, + persistable=True, + dtype=dtype or param.dtype, + type=param.type if type is None else type, + shape=shape, + belong_to_optimizer=True) + if device is None: + device = self._get_device_for_param(param.name) + with device_guard(device): + self.helper.set_variable_initializer( + var, initializer=Constant(value=float(fill_value))) + + if framework.in_dygraph_mode(): + if len(self._accumulators_holder) > 0: + assert var_name in self._accumulators_holder, \ + "Optimizer set error, {} should in state dict".format( var_name ) + var.set_value(self._accumulators_holder[var_name]) + + self._accumulators[name][param.name] = var + return var + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + + Args: + name: name of the accumulator + param: parameter variable for which accumulator is to be fetched + + Returns: + accumulator variable for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + if (name not in self._accumulators or + param.name not in self._accumulators[name]): + raise Exception("Accumulator {} does not exist for parameter {}". + format(name, param.name)) + return self._accumulators[name][param.name] + + def _update_param_device_map(self, parameters_and_grads, target_block): + for param_and_grad in parameters_and_grads: + if param_and_grad[0].trainable is True: + param_name = param_and_grad[0].name + ops = target_block.ops + device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName( + ) + for op in ops: + input_arg_names = op.input_arg_names + if param_name in input_arg_names: + self._param_device_map[param_name] = op.attr( + device_attr_name) + break + + def _get_device_for_param(self, param_name): + device = None + if param_name in self._param_device_map: + device = self._param_device_map[param_name] + return device + + def _create_optimization_pass(self, parameters_and_grads): + """Add optimization operators to update gradients to variables. + + Args: + parameters_and_grads(list(tuple(Variable, Variable))): + a list of (variable, gradient) pair to update. + + Returns: + return_op_list: a list of operators that will complete one step of + optimization. This will include parameter update ops, global step + update ops and any other custom ops required by subclasses to manage + their internal state. + """ + # This is a default implementation of create_optimization_pass that + # can be shared by most optimizers. This implementation assumes that + # the subclass will implement the _append_optimize_op method and the + # _initialize_tensors method. The subclass can extend the + # _create_accumulators method if it needs to create accumulators + # for parameters and extend _finish_update method to add custom ops. 
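+        # For example (illustrative sketch, not part of this file): a minimal
+        # subclass usually overrides only _append_optimize_op, appending the
+        # update op for one (param, grad) pair, e.g. an SGD-style update:
+        #
+        #     def _append_optimize_op(self, block, param_and_grad):
+        #         return block.append_op(
+        #             type='sgd',
+        #             inputs={"Param": param_and_grad[0],
+        #                     "Grad": param_and_grad[1],
+        #                     "LearningRate": self._create_param_lr(param_and_grad)},
+        #             outputs={"ParamOut": param_and_grad[0]})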
+ + # Allways called under program_guard use global block as loss block + # But if current block is in control flow, append optimize op in the + # grad block of current block + + global_block = framework.default_main_program().global_block() + target_block = global_block + current_block = framework.default_main_program().current_block() + if current_block.idx != global_block.idx: + assert current_block.backward_block_idx != -1, \ + "current block is not global_block, but it doesn't have backward block." + target_block = framework.default_main_program().blocks[ + current_block.backward_block_idx] + + start = len(target_block.ops) + self.helper = LayerHelper(self.__class__.__name__) + self._update_param_device_map(parameters_and_grads, target_block) + self._create_accumulators( + target_block, + [p[0] for p in parameters_and_grads if p[0].trainable]) + self._create_global_learning_rate() + + if framework.in_dygraph_mode(): + for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue + if param_and_grad[0].trainable is True: + self._append_optimize_op(target_block, param_and_grad) + else: + for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue + with param_and_grad[0].block.program._optimized_guard( + param_and_grad), name_scope("optimizer"): + if param_and_grad[0].trainable is True: + device = self._get_device_for_param(param_and_grad[0] + .name) + with device_guard(device): + optimize_op = self._append_optimize_op( + target_block, param_and_grad) + + # Get custom finish ops for subclasses + # FIXME: Need to fix this once we figure out how to handle dependencies + self._finish_update(target_block, parameters_and_grads) + + end = len(target_block.ops) + return target_block._slice_ops(start, end) + + def _process_distribute_lookuptable(self, param_grads): + """ + Because distribute lookup table only support SGD optimizer for now, not support + other optimizer and regularization, so we should find the table parameter out, + and avoid to add regularization and other op for it, and add sgd optimize op + for it independently. + :param param_grads(list((Var, Var))): list of (param, grad) pair. + :param loss: the loss variable. + :param startup_program: the startup program + """ + program = framework.default_main_program() + global_block = framework.default_main_program().global_block() + table_name = find_distributed_lookup_table(program) + table_param = None + table_grad = None + new_param_grads = [] + for p, g in param_grads: + if p.name == table_name: + if table_param is not None: + raise RuntimeError( + "multi dist table var found, only support one now!") + table_param = p + table_grad = g + else: + new_param_grads.append((p, g)) + sgd_op = None + if table_param is not None: + param_and_grad = [table_param, table_grad] + with table_param.block.program._optimized_guard(param_and_grad), \ + framework.name_scope("optimizer"): + self._create_global_learning_rate() + # create the optimize op + sgd_op = global_block.append_op( + type='sgd', + inputs={ + "Param": table_param, + "Grad": table_grad, + "LearningRate": self._create_param_lr(param_and_grad) + }, + outputs={"ParamOut": param_and_grad[0]}) + return new_param_grads, (table_param, table_grad), sgd_op + + def _append_dgc_ops(self, param_and_grad): + pass + + def backward(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None): + """ + The first part of ``minimize``, do auto-diff to append backward operations for + the current program. 
+ + Args: + loss (Variable): ``loss`` variable to run optimizations. + startup_program (Program, optional): :ref:`api_fluid_Program` for + initializing parameters in ``parameter_list``. The default value + is None, at this time :ref:`api_fluid_default_startup_program` will be used. + parameter_list (Iterable, optional): Iterable of ``Variable`` or ``Variable.name`` to update + to minimize ``loss``. The default value is None, at this time all parameters + will be updated. + no_grad_set (set, optional): Set of ``Variable`` or ``Variable.name`` that don't need + to be updated. The default value is None. + callbacks (list, optional): list of callable objects to run when appending backward + operator for one parameter. The default value is None. + + Return: + list: list of (param, grad) variable pairs, param is ``Parameter``, + grad is the gradient value corresponding to the parameter. + + Examples: + See examples in ``apply_gradients``. + """ + act_no_grad_set = None + if framework.in_dygraph_mode(): + pass + else: + act_no_grad_set = self._get_no_grad_set(loss, no_grad_set) + + self._dtype = loss.dtype + if framework.in_dygraph_mode(): + params_grads = [] + for param in self._parameter_list: + if not param.trainable: + continue + if param._grad_ivar() is not None: + # create gradient variable + grad_var = param._grad_ivar() + params_grads.append((param, grad_var)) + else: + if callbacks is None: + callbacks = [error_clip_callback] + else: + assert (isinstance(callbacks, list)) + program = loss.block.program + assert len(loss.shape) == 1 and loss.shape[0] == 1, \ + "The loss.shape should be (1L,), but the current loss.shape is {}. " \ + "Maybe that you should call fluid.layers.mean to process the current loss.".format( + loss.shape) + parameter_list = parameter_list if parameter_list \ + else self._parameter_list + with program_guard(program, startup_program): + params_grads = append_backward(loss, parameter_list, + act_no_grad_set, callbacks) + # Note: since we can't use all_reduce_op now, + # dgc_op should be the last op of one grad. + self._append_dgc_ops(params_grads) + return params_grads + + def apply_gradients(self, params_grads): + """ + Second part of `minimize`, appending optimization operators for + given `params_grads` pairs. + + Args: + params_grads (list): list of (param, grad) pair to do optimization. + + Returns: + list: A list of operators appended to the current program. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + loss = network() + optimizer = paddle.optimizer.SGD(learning_rate=0.1) + params_grads = optimizer.backward(loss) + # you may append operations for params_grads here + # ... + optimizer.apply_gradients(params_grads) + """ + + params_grads = sorted(params_grads, key=lambda x: x[0].name) + + # 'optimizer(grad_clip)' or 'set_gradient_clip' + if self._grad_clip is not None: + params_grads = self._grad_clip(params_grads) + else: + params_grads = append_gradient_clip_ops(params_grads) + + # Add regularization if any + params_grads = append_regularization_ops(params_grads, + self.regularization) + + optimize_ops = self._create_optimization_pass(params_grads) + return optimize_ops + + def apply_optimize(self, loss, startup_program, params_grads): + """ + Second part of `minimize`, appending optimization operators for + given `params_grads` pairs. + Args: + loss (Variable): loss variable to run optimizations. + startup_program (Program): startup_program for initializing parameters + in `parameter_list`. 
+ params_grads (list): list of (param, grad) pair to do optimization. + Returns: + list: A list of operators appended to the current program. + """ + if framework.in_dygraph_mode(): + with program_guard(framework.default_main_program(), + framework.default_startup_program()): + if self._grad_clip is not None: + params_grads = self._grad_clip(params_grads) + params_grads = append_regularization_ops(params_grads, + self.regularization) + optimize_ops = self._create_optimization_pass(params_grads) + else: + program = loss.block.program + with program_guard(program, startup_program): + optimize_ops = self.apply_gradients(params_grads) + return optimize_ops + + def _get_no_grad_set(self, loss, no_grad_set=None): + no_grad_set = _get_no_grad_set_name(no_grad_set) + parameters = loss.block.program.global_block().all_parameters() + param_no_trainable = set( + [param.name for param in parameters if param.trainable is False]) + # If the parameter is no trainable, it should not have a gradient. + no_grad_set.update(param_no_trainable) + + return no_grad_set + + @framework.dygraph_only + def clear_gradients(self): + """ + Clear the gradients of all optimized parameters for model. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + import numpy as np + + with fluid.dygraph.guard(): + value = np.arange(26).reshape(2, 13).astype("float32") + a = fluid.dygraph.to_variable(value) + linear = fluid.Linear(13, 5, dtype="float32") + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.Adam(lr = 0.01, + params = linear.parameters()) + out = linear(a) + out.backward() + adam.minimize(out) + adam.clear_gradients() + + """ + for p in self._parameter_list: + if p.trainable: + p.clear_gradient() + + @imperative_base.no_grad() + def minimize(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + """ + Add operations to minimize ``loss`` by updating ``parameter_list``. + + Args: + loss (Variable): A ``Variable`` containing the value to minimize. + startup_program (Program, optional): :ref:`api_fluid_Program` for + initializing parameters in ``parameter_list``. The default value + is None, at this time :ref:`api_fluid_default_startup_program` will be used. + parameter_list (Iterable, optional): Iterable of ``Variable`` or ``Variable.name`` to update + to minimize ``loss``. The default value is None, at this time all parameters + will be updated. + no_grad_set (set, optional): Set of ``Variable`` or ``Variable.name`` that don't need + to be updated. The default value is None. + + Returns: + tuple: tuple (optimize_ops, params_grads), A list of operators appended + by minimize and a list of (param, grad) variable pairs, param is + ``Parameter``, grad is the gradient value corresponding to the parameter. + The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. If so, the program will be pruned by ``feed`` and + ``fetch_list`` before run, see details in ``Executor``. + + Examples: + Please refer to the example of current Optimizer. + """ + assert isinstance(loss, Variable), "The loss should be an Variable." 
+ + parameter_list = parameter_list if parameter_list \ + else self._parameter_list + params_grads = self.backward( + loss, + startup_program=startup_program, + parameter_list=parameter_list, + no_grad_set=no_grad_set) + + optimize_ops = self.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + + return optimize_ops, params_grads + + +class SGDOptimizer(Optimizer): + """ + Optimizer of the stochastic gradient descent algorithm. + + .. math:: + + param\_out = param - learning\_rate * grad + + Parameters: + learning_rate (float|Variable): The learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ + :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ + regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ + ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): This parameter is used by developers to print debugging information. \ + For details, please refer to :ref:`api_guide_Name`. Default is None. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + import numpy as np + + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + """ + + def __init__(self, + learning_rate, + parameter_list=None, + regularization=None, + grad_clip=None, + name=None): + assert learning_rate is not None + super(SGDOptimizer, self).__init__( + learning_rate=learning_rate, + parameter_list=parameter_list, + regularization=regularization, + grad_clip=grad_clip, + name=name) + self.type = "sgd" + + @no_grad() + def _append_optimize_op(self, block, param_and_grad): + lr = self._create_param_lr(param_and_grad) + if framework.in_dygraph_mode(): + core.ops.sgd(param_and_grad[0], lr, param_and_grad[1], + param_and_grad[0]) + return None + + assert isinstance(block, framework.Block) + # create the optimize op + sgd_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "LearningRate": lr + }, + outputs={"ParamOut": param_and_grad[0]}, + stop_gradient=True) + + return sgd_op + + +class MomentumOptimizer(Optimizer): + """ + + Simple Momentum optimizer with velocity state + + This optimizer has a flag for Nestrov Momentum. + + The update equations are as follows: + + .. math:: + + & velocity = mu * velocity + gradient + + & if (use\_nesterov): + + &\quad param = param - (gradient + mu * velocity) * learning\_rate + + & else: + + &\quad param = param - learning\_rate * velocity + + Parameters: + learning_rate (float|Variable): The learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + momentum (float): Momentum factor + parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + use_nesterov (bool, optional): Enables Nesterov momentum, default is false. + regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ + :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ + regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ + ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). 
Default None, meaning there is no gradient clipping. + name (str, optional): This parameter is used by developers to print debugging information. \ + For details, please refer to :ref:`api_guide_Name`. Default is None. + + Examples: + .. code-block:: python + + import paddle + import paddle.fluid as fluid + import numpy as np + + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + moment_optimizer = paddle.optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9) + moment_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + """ + _velocity_acc_str = "velocity" + + def __init__(self, + learning_rate, + momentum, + parameter_list=None, + use_nesterov=False, + regularization=None, + grad_clip=None, + name=None): + assert learning_rate is not None + assert momentum is not None + super(MomentumOptimizer, self).__init__( + learning_rate=learning_rate, + parameter_list=parameter_list, + regularization=regularization, + grad_clip=grad_clip, + name=name) + self.type = "momentum" + self._momentum = momentum + self._use_nesterov = bool(use_nesterov) + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + + for p in parameters: + self._add_accumulator(self._velocity_acc_str, p) + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + velocity_acc = self._get_accumulator(self._velocity_acc_str, + param_and_grad[0]) + lr = self._create_param_lr(param_and_grad) + + if framework.in_dygraph_mode(): + _, _ = core.ops.momentum(param_and_grad[0], param_and_grad[1], + velocity_acc, lr, param_and_grad[0], + velocity_acc, 'mu', self._momentum, + 'use_nesterov', self._use_nesterov) + return None + + attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov} + inputs = { + "Param": [param_and_grad[0]], + "Grad": [param_and_grad[1]], + "Velocity": [velocity_acc], + "LearningRate": [lr] + } + + outputs = { + "ParamOut": [param_and_grad[0]], + "VelocityOut": [velocity_acc] + } + # create the momentum optimize op + momentum_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True) + + return momentum_op + + +class DGCMomentumOptimizer(Optimizer): + """ + :api_attr: Static Graph + + DGC (Deep Gradient Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887 + + DGC reduces the communication bandwidth by sending only the important gradients (sparse update):\ + only gradients larger than a threshold are transmitted. + + To avoid losing information, DGC accumulates the rest of the gradients locally. + + Eventually, these gradients become large enough to be transmitted. + + Thus, DGC sends the large gradients immediately but eventually sends all of the gradients over time. 
+ + To ensure no loss of accuracy, DGC employs momentum correction and local gradient clipping on top of the gradient sparsification to maintain model performance. + + DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication. + + This optimizer will do two things: + + 1. Compress the gradient by get TopK import value from tensor \ + and use it for allreduce to reduce network bandwidth. + + 2. Call momentum to optimize the cost. + + Args: + learning_rate (float|Variable): The learning rate used to update parameters. \ + It can be a float value or a Variable with one float value as a data element. + momentum (float): Momentum factor. + rampup_begin_step (int): The beginning step from which gradient compression is implemented. + rampup_step (int): Time steps used in sparsity warm-up periods. Default is 1. + For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100, \ + it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. \ + And when reach sparsity array ends, it will use 0.999 then and after. + sparsity (list[float]): Get top important element from gradient tensor, the ratio is (1 - current sparsity). \ + Default is [0.999]. For example, if the sparsity is [0.99, 0.999], \ + the top [1%, 0.1%] important element will be transmitted. + parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + use_nesterov (bool): Enables Nesterov momentum. True means use Nesterov. Default is False. + regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ + :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ + regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ + ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipByNorm, optional): Gradient cliping strategy. ``DGCMomentumOptimizer`` only support + :ref:`api_fluid_clip_GradientClipByNorm` , and if not, it will raise TypeError. Default None, + meaning there is no gradient clipping. + name (str, optional): This parameter is used by developers to print debugging information. \ + For details, please refer to :ref:`api_guide_Name`. Default is None. + + Examples: + .. code-block:: python + + import paddle + optimizer = paddle.optimizer.DGCMomentumOptimizer( + learning_rate=0.0001, + momentum=0.9, + rampup_step=1000, + rampup_begin_step=1252, + sparsity=[0.999, 0.999]) + + """ + _u_velocity_acc_str = "_dgc_u_" + _v_velocity_acc_str = "_dgc_v_" + + def __init__(self, + learning_rate, + momentum, + rampup_begin_step, + rampup_step=1, + sparsity=[0.999], + parameter_list=None, + use_nesterov=False, + num_trainers=None, + regularization=None, + grad_clip=None, + name=None): + if framework.in_dygraph_mode(): + raise Exception("In dygraph, don't support DGCMomentumOptimizer.") + + assert core.is_compiled_with_cuda(), \ + "Paddle is not compiled with CUDA. DGC is only support GPU for now." 
+ + assert learning_rate is not None + assert momentum is not None + super(DGCMomentumOptimizer, self).__init__( + learning_rate=learning_rate, + parameter_list=parameter_list, + regularization=regularization, + grad_clip=grad_clip, + name=name) + self.type = "dgc_momentum" + self._momentum = momentum + self._use_nesterov = bool(use_nesterov) + + assert rampup_begin_step >= 0, "rampup_begin_step must >= 0" + self._rampup_begin_step = rampup_begin_step + self._rampup_step = rampup_step + self._sparsity = sparsity + + self._rampup_begin_step_var = None + self._global_step_var = None + + self._dgc_clip_norm = None + if grad_clip is not None: + if not isinstance(grad_clip, GradientClipByNorm): + raise TypeError( + "The type of grad_clip should be 'GradientClipByNorm', because DGCMomentumOptimizer only support GradientClipByNorm" + ) + assert isinstance( + num_trainers, int + ), "The type of num_trainers should be 'int', but received %s" % type( + value) + assert num_trainers > 0, "The value of num_trainers should be greater than 0!" + + self._num_trainers = num_trainers + self._dgc_clip_norm = grad_clip.clip_norm * (num_trainers**-0.5) + + self.regular_type, self.regular_coeff = self._get_regularization_param( + self.regularization) + + def _get_regularization_param(self, regularization): + regular_type = 0 + regular_coeff = 0.0 + + if regularization is not None: + regular_coeff = regularization._regularization_coeff + from .regularizer import L1Decay, L2Decay + if isinstance(regularization, L1Decay): + regular_type = 1 + elif isinstance(regularization, L2Decay): + regular_type = 2 + else: + assert False, 'regularization must be None|L1Decay|L2Deacy' + return regular_type, regular_coeff + + def _is_use_dgc(self, param_var, grad_var): + var_numel = abs(reduce(lambda x, y: x * y, param_var.shape)) + if var_numel < 16384 or \ + param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \ + grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \ + param_var.dtype != core.VarDesc.VarType.FP32 : + return False + return True + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + velocity_acc = self._get_accumulator(self._u_velocity_acc_str, + param_and_grad[0]) + assert velocity_acc is not None + + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Velocity": velocity_acc, + "LearningRate": self._create_param_lr(param_and_grad), + } + outputs = { + "ParamOut": param_and_grad[0], + "VelocityOut": velocity_acc, + } + attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov} + + if not self._is_use_dgc(param_and_grad[0], param_and_grad[1]): + type = "momentum" + else: + type = "dgc_momentum" + inputs.update({ + "current_step": self._global_step_var, + "nranks": self._nranks_var + }) + outputs.update({'Grad_out': param_and_grad[1]}) + attrs.update({"rampup_begin_step": float(self._rampup_begin_step)}) + + # create the dgc momentum optimize op + dgc_momentum_op = block.append_op( + type=type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True) + return dgc_momentum_op + + def _add_auto_increment_var(self, counter_name, begin, step=1): + helper = LayerHelper('global_step_counter') + counter, is_new_var = helper.create_or_get_global_variable( + name=counter_name, dtype='float32', shape=[1], persistable=True) + if is_new_var: + helper.set_variable_initializer( + counter, + initializer=Constant( + value=float(begin - 1), force_cpu=True)) + helper.main_program.global_block()._prepend_op( + type='increment', + 
inputs={'X': [counter]}, + outputs={'Out': [counter]}, + attrs={'step': float(step)}, + stop_gradient=True) + counter.stop_gradient = True + + return counter + + def _add_nranks_var(self, name, value=-1): + helper = LayerHelper('global_step_counter') + counter, is_new_var = helper.create_or_get_global_variable( + name=name, dtype='float32', shape=[1], persistable=True) + if is_new_var: + helper.set_variable_initializer( + counter, + initializer=Constant( + value=float(value), force_cpu=True)) + counter.stop_gradient = True + + return counter + + def _append_dgc_ops(self, param_and_grads): + main_program = default_main_program() + main_program._enable_dgc = True + + # step counter + self._global_step_var = self._add_auto_increment_var( + counter_name=core.dgc.kDGCCounterName(), begin=0) + + self._nranks_var = self._add_nranks_var( + name=core.dgc.kDGCNRanksName(), value=-1) + + # rampup begin step var for all_reduce_op_handle + self._rampup_begin_step_var = tensor.create_global_var( + shape=[1], + dtype=core.VarDesc.VarType.FP32, + persistable=True, + name=core.dgc.kDGCRampUpBeginStepName(), + value=self._rampup_begin_step * 1.0, + force_cpu=True) + + self.helper = LayerHelper(self.__class__.__name__) + + for param_var, grad_var in param_and_grads: + # reuse velocity in dgc_op and dgc_momentum_op + u_var = self._add_accumulator(self._u_velocity_acc_str, param_var) + + if not self._is_use_dgc(param_var, grad_var): + continue + + v_var = self._add_accumulator(self._v_velocity_acc_str, param_var) + + k_var = tensor.create_global_var( + shape=[1], + dtype=param_var.dtype, + persistable=True, + name=param_var.name + core.dgc.kDGCKName(), + value=0.0, + force_cpu=True) + + encoded_var = tensor.create_global_var( + shape=[1], + dtype=param_var.dtype, + persistable=True, + name=param_var.name + core.dgc.kDGCEncodedName(), + value=0.0, + force_cpu=False) + + gather_var = tensor.create_global_var( + shape=[1], + dtype=param_var.dtype, + persistable=True, + name=param_var.name + core.dgc.kDGCGatherName(), + value=0.0, + force_cpu=False) + + # del back oprolevarname + op_maker = core.op_proto_and_checker_maker + backward = core.op_proto_and_checker_maker.OpRole.Backward + for op in main_program.global_block().ops: + if not self._is_the_backward_op(op): + continue + + var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()] + if param_var.name not in var_attr: + continue + + var_attr.remove(param_var.name) + var_attr.remove(grad_var.name) + if len(var_attr) > 1: + op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr) + else: + op._remove_attr(op_maker.kOpRoleVarAttrName()) + + clip_var = grad_var + if self._dgc_clip_norm is not None: + clip_var = self._append_clip_norm(grad_var, self._dgc_clip_norm) + self._dgc_op(param_var, clip_var, grad_var, u_var, v_var, k_var, + encoded_var, gather_var) + + def _is_the_backward_op(self, op): + op_maker = core.op_proto_and_checker_maker + backward = core.op_proto_and_checker_maker.OpRole.Backward + if op_maker.kOpRoleVarAttrName() in op.attr_names and \ + int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(backward): + return True + return False + + def _clip_by_norm(self, x, max_norm, name=None): + args = {'x': x, 'max_norm': max_norm, 'name': name} + + helper = LayerHelper("dgc_clip_by_norm_op", **args) + + if name is None: + name = unique_name.generate_with_ignorable_key(".".join( + [helper.name, 'tmp'])) + + out = helper.create_variable( + type=x.type, name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type="dgc_clip_by_norm", + inputs={"X": 
x, + "current_step": self._global_step_var}, + attrs={ + "max_norm": max_norm, + "rampup_begin_step": float(self._rampup_begin_step) + }, + outputs={"Out": out}) + return out + + def _append_clip_norm(self, grad_var, clip_norm): + with grad_var.block.program._backward_role_guard(): + return self._clip_by_norm( + x=grad_var, max_norm=clip_norm, name=grad_var.name) + + def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var, + encoded_var, gather_var): + block = framework.default_main_program().global_block() + op_maker = core.op_proto_and_checker_maker + + regular_type = self.regular_type + regular_coeff = self.regular_coeff + # The regularizer of the Parameters have higher priority + if param_var.regularizer is not None: + regular_type, regular_coeff = self._get_regularization_param( + param_var.regularizer) + + dgc_op = block.append_op( + type="dgc", + inputs={ + "U": u_var, + "V": v_var, + "Grad": clip_var, + "Param": param_var, + "current_step": self._global_step_var, + "nranks": self._nranks_var, + }, + outputs={ + "U_out": u_var, + "V_out": v_var, + "EncodeGrad": encoded_var, + "k": k_var, + "Grad_out": grad_var, + "GatherBuff": gather_var, + }, + attrs={ + "m": self._momentum, + "sparsity": self._sparsity, + "use_nesterov": self._use_nesterov, + "rampup_begin_step": float(self._rampup_begin_step), + "rampup_step": float(self._rampup_step), + "regular_coeff": float(regular_coeff), + "regular_type": int(regular_type), + }, + stop_gradient=True) + + backward = op_maker.OpRole.Backward + dgc_op._set_attr(op_maker.kOpRoleAttrName(), backward) + dgc_op._set_attr(op_maker.kOpRoleVarAttrName(), + [param_var.name, grad_var.name]) + + @imperative_base.no_grad() + def apply_gradients(self, params_grads): + params_grads = sorted(params_grads, key=lambda x: x[0].name) + params_grads, table_param_and_grad, table_optimize_op = \ + self._process_distribute_lookuptable(params_grads) + + not_dgc_params_grads = [] + dgc_params_grads = [] + # DGC clip and regularization in optimizer.backward + for param, grad in params_grads: + if not self._is_use_dgc(param, grad): + not_dgc_params_grads.append((param, grad)) + else: + dgc_params_grads.append((param, grad)) + + # 'optimizer(grad_clip)' or 'set_gradient_clip' + if self._grad_clip is not None: + not_dgc_params_grads = self._grad_clip(not_dgc_params_grads) + else: + not_dgc_params_grads = append_gradient_clip_ops( + not_dgc_params_grads) + + not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads, + self.regularization) + + params_grads = not_dgc_params_grads + dgc_params_grads + params_grads = sorted(params_grads, key=lambda x: x[0].name) + + optimize_ops = self._create_optimization_pass(params_grads) + if table_optimize_op is not None: + optimize_ops.append(table_optimize_op) + params_grads.append(table_param_and_grad) + + return optimize_ops + + +class LarsMomentumOptimizer(Optimizer): + """ + Momentum optimizer with LARS support + + The update equations are as follows: + + .. math:: + + & local\_learning\_rate = learning\_rate * lars\_coeff * \\ + \\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||} + + & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param) + + & param = param - velocity + + Parameters: + learning_rate (float|Variable): The learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. \ + momentum (float): momentum factor + lars_coeff (float): Defines how much we trust the layer to change its weights. 
+ lars_weight_decay (float): Weight decay coefficient for decaying using LARS. + parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ + :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ + regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ + ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): This parameter is used by developers to print debugging information. \ + For details, please refer to :ref:`api_guide_Name`. Default is None. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + import paddle + + np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + inp = fluid.layers.data( + name="inp", shape=[2, 2], append_batch_size=False) + out = fluid.layers.fc(inp, size=3) + out = fluid.layers.reduce_sum(out) + optimizer = paddle.optimizer.LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9) + optimizer.minimize(out) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + exe.run( + feed={"inp": np_inp}, + fetch_list=[out.name]) + """ + _velocity_acc_str = "velocity" + + def __init__(self, + learning_rate, + momentum, + lars_coeff=0.001, + lars_weight_decay=0.0005, + parameter_list=None, + regularization=None, + grad_clip=None, + name=None): + assert learning_rate is not None + assert momentum is not None + super(LarsMomentumOptimizer, self).__init__( + learning_rate=learning_rate, + parameter_list=parameter_list, + regularization=regularization, + grad_clip=grad_clip, + name=name) + self.type = "lars_momentum" + self._momentum = momentum + self._lars_coeff = float(lars_coeff) + self._lars_weight_decay = float(lars_weight_decay) + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + + for p in parameters: + self._add_accumulator(self._velocity_acc_str, p) + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + velocity_acc = self._get_accumulator(self._velocity_acc_str, + param_and_grad[0]) + # create the momentum optimize op + momentum_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Velocity": velocity_acc, + "LearningRate": self._create_param_lr(param_and_grad) + }, + outputs={ + "ParamOut": param_and_grad[0], + "VelocityOut": velocity_acc + }, + attrs={ + "mu": self._momentum, + "lars_coeff": self._lars_coeff, + "lars_weight_decay": self._lars_weight_decay + }, + stop_gradient=True) + + return momentum_op + + +class AdagradOptimizer(Optimizer): + """ + The Adaptive Gradient optimizer (Adagrad for short) can adaptively assign + different learning rates to individual 
parameters. + + The parameter ``param_out`` update rule with gradient ``grad``: + + .. math:: + + moment\_out &= moment + grad * grad + + param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} + + Related paper: `Adaptive Subgradient Methods for Online Learning and + Stochastic Optimization `_. + + The original paper does not have the ``epsilon`` attribute. It is added here + in our implementation as also proposed `Per-parameter adaptive learning rate + methods `_ + for numerical stability to avoid the division by zero error. + + Args: + learning_rate (float|Variable): The learning rate used to update ``Parameter``. + It can be a float value or a ``Variable`` with a float type. + epsilon (float, optional): A small float value for numerical stability. + The default value is 1e-06. + parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ + :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ + regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ + ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + initial_accumulator_value (float, optional): Initial value for moment accumulator. + The default value is 0.0. + + Examples: + .. 
code-block:: python + + import numpy as np + import paddle + import paddle.fluid as fluid + + np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + inp = fluid.data(name="inp", shape=[2, 2]) + out = fluid.layers.fc(inp, size=3) + out = fluid.layers.reduce_sum(out) + optimizer = paddle.optimizer.AdagradOptimizer(learning_rate=0.2) + optimizer.minimize(out) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + exe.run( + feed={"inp": np_inp}, + fetch_list=[out.name]) + """ + _moment_acc_str = "moment" + + def __init__(self, + learning_rate, + epsilon=1.0e-6, + parameter_list=None, + regularization=None, + grad_clip=None, + name=None, + initial_accumulator_value=0.0): + assert learning_rate is not None + assert epsilon is not None + super(AdagradOptimizer, self).__init__( + learning_rate=learning_rate, + parameter_list=parameter_list, + regularization=regularization, + grad_clip=grad_clip, + name=name) + self.type = "adagrad" + self._epsilon = epsilon + self.initial_accumulator_value = initial_accumulator_value + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + + for p in parameters: + self._add_accumulator( + self._moment_acc_str, + p, + fill_value=self.initial_accumulator_value) + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + moment_acc = self._get_accumulator(self._moment_acc_str, + param_and_grad[0]) + # Create the adagrad optimizer op + adagrad_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Moment": moment_acc, + "LearningRate": self._create_param_lr(param_and_grad) + }, + outputs={"ParamOut": param_and_grad[0], + "MomentOut": moment_acc}, + attrs={"epsilon": self._epsilon}, + stop_gradient=True) + + return adagrad_op + + +class Adam(Optimizer): + """ + The Adam optimizer uses an optimization described at the end + of section 2 of `Adam paper `_ , + it can dynamically adjusts the learning rate of each parameter using + the 1st moment estimates and the 2nd moment estimates of the gradient. + + The parameter ``param_out`` update rule with gradient ``grad``: + + .. math:: + + t & = t + 1 + + moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad + + moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad + + learning\_rate & = learning\_rate * \\ + \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t} + + param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + + Related paper: `Adam: A Method for Stochastic Optimization `_ + + Args: + lr (float|Tensor, optional): The learning rate used to update ``Parameter``. + It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. + betas (list[float]|list[Tensor], optional): The list of exponential decay rate for moment estimates. + The elements of list should be float numbers or Tensor with shape [1] and data type as float32. + The default value is [0.9, 0.999]. + eps (float, optional): A small float value for numerical stability. + The default value is 1e-08. + params (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float, optional): Weight decay of L2 regularization. The default value is 0. 
+ grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators. + The accumulators are updated at every step. Every element of the two moving-average + is updated in both dense mode and sparse mode. If the size of parameter is very large, + then the update may be very slow. The lazy mode only update the element that has + gradient in current mini-batch, so it will be much more faster. But this mode has + different semantics with the original Adam algorithm and may lead to different result. + The default value is False. + + Examples: + .. code-block:: python + + import paddle + import paddle.fluid as fluid + + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.data(name='x', shape=[None, 13], dtype='float32') + y = fluid.data(name='y', shape=[None, 1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + adam_optimizer = paddle.optimizer.Adam(0.01) + adam_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + .. 
code-block:: python + + # Adam with betas as list[Tensor] + import paddle + import paddle.fluid as fluid + import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler + + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.data(name='x', shape=[None, 13], dtype='float32') + y = fluid.data(name='y', shape=[None, 1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + # define beta decay Tensor + def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate): + global_step = lr_scheduler._decay_step_counter() + + beta1 = fluid.layers.create_global_var( + shape=[1], + value=float(beta1_init), + dtype='float32', + # set persistable for save checkpoints and resume + persistable=True, + name="beta1") + beta2 = fluid.layers.create_global_var( + shape=[1], + value=float(beta2_init), + dtype='float32', + # set persistable for save checkpoints and resume + persistable=True, + name="beta2") + + div_res = global_step / decay_steps + decayed_beta1 = beta1_init * (decay_rate**div_res) + decayed_beta2 = beta2_init * (decay_rate**div_res) + fluid.layers.assign(decayed_beta1, beta1) + fluid.layers.assign(decayed_beta2, beta2) + + return [beta1, beta2] + + betas = get_decayed_betas(0.9, 0.99, 1e5, 0.9) + adam_optimizer = paddle.optimizer.Adam( + lr=0.01, + betas=betas) + adam_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + """ + _moment1_acc_str = "moment1" + _moment2_acc_str = "moment2" + _beta1_pow_acc_str = "beta1_pow_acc" + _beta2_pow_acc_str = "beta2_pow_acc" + + def __init__(self, + lr=0.001, + betas=[0.9, 0.999], + eps=1e-8, + params=None, + weight_decay=0.0, + grad_clip=None, + name=None, + lazy_mode=False): + assert learning_rate is not None + assert betas is not None + assert epsilon is not None + regularization = L2Decay(regularization_coeff=weight_decay) + super(Adam, self).__init__( + learning_rate=lr, + parameter_list=params, + regularization=regularization, + grad_clip=grad_clip, + name=name) + self.type = "adam" + self._beta1 = betas[0] + self._beta2 = betas[1] + self._epsilon = epsilon + self._lazy_mode = lazy_mode + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + + # Create accumulator tensors for first and second moments + for p in parameters: + self._add_accumulator(self._moment1_acc_str, p) + self._add_accumulator(self._moment2_acc_str, p) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + fill_value=0.9 if isinstance(self._beta1, Variable) \ + else self._beta1, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + self._add_accumulator( + name=self._beta2_pow_acc_str, + param=p, + fill_value=0.999 if isinstance(self._beta2, Variable) \ + else self._beta2, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + moment1 = self._get_accumulator(self._moment1_acc_str, + param_and_grad[0]) + moment2 = self._get_accumulator(self._moment2_acc_str, + param_and_grad[0]) + beta1_pow_acc = 
self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param_and_grad[0]) + lr = self._create_param_lr(param_and_grad) + # create the adam optimize op + + if framework.in_dygraph_mode(): + _beta1 = self._beta1 if not isinstance( + self._beta1, Variable) else self._beta1.numpy().item(0) + _beta2 = self._beta2 if not isinstance( + self._beta2, Variable) else self._beta2.numpy().item(0) + _, _, _, _, _ = core.ops.adam( + param_and_grad[0], param_and_grad[1], lr, moment1, moment2, + beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1, + moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon, + 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', + 1000, 'beta1', _beta1, 'beta2', _beta2) + + return None + + inputs = { + "Param": [param_and_grad[0]], + "Grad": [param_and_grad[1]], + "LearningRate": [lr], + "Moment1": [moment1], + "Moment2": [moment2], + "Beta1Pow": [beta1_pow_acc], + "Beta2Pow": [beta2_pow_acc] + } + outputs = { + "ParamOut": [param_and_grad[0]], + "Moment1Out": [moment1], + "Moment2Out": [moment2], + "Beta1PowOut": [beta1_pow_acc], + "Beta2PowOut": [beta2_pow_acc], + } + attrs = { + "epsilon": self._epsilon, + "lazy_mode": self._lazy_mode, + "min_row_size_to_use_multithread": 1000 + } + + if isinstance(self._beta1, Variable): + inputs['Beta1Tensor'] = self._beta1 + else: + attrs['beta1'] = self._beta1 + if isinstance(self._beta2, Variable): + inputs['Beta2Tensor'] = self._beta2 + else: + attrs['beta2'] = self._beta2 + + adam_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True) + + return adam_op + + +class AdamaxOptimizer(Optimizer): + """ + The Adamax optimizer is implemented based on the Adamax Optimization + in Section 7 of `Adam paper `_. + The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm, + which makes the learning rate update algorithm more stable and simple. + + The parameter ``param_out`` update rule with gradient ``grad``: + + .. math:: + + t & = t + 1 + + moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad + + inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|) + + learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t} + + param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out} + + Related paper: `Adam: A Method for Stochastic Optimization `_ + + The original paper does not have an ``epsilon`` attribute, + it is added here for numerical stability to prevent the division by 0 error. + + Args: + lr (float|Tensor, optional): The learning rate used to update ``Parameter``. + It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. + betas (list[float]|list[Tensor], optional): The list of exponential decay rate for moment estimates. + The elements of list should be float numbers or Tensor with shape [1] and data type as float32. + The default value is [0.9, 0.999]. + epsilon (float, optional): A small float value for numerical stability. + The default value is 1e-08. + parms (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float, optional): Weight decay of L2 regularization. The default value is 0. 
+ grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + + **Notes**: + **Currently, Adamax doesn't support sparse parameter optimization.** + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + import numpy + + # First create the Executor. + place = fluid.CPUPlace() # fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + train_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + data = fluid.data(name='X', shape=[None, 1], dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + loss = fluid.layers.mean(hidden) + adam = paddle.optimizer.Adamax(lr=0.2) + adam.minimize(loss) + + # Run the startup program once and only once. + exe.run(startup_program) + + x = numpy.random.random(size=(10, 1)).astype('float32') + outs = exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) + """ + _moment_acc_str = "moment" + _inf_norm_acc_str = "inf_norm" + _beta1_pow_acc_str = "beta1_pow_acc" + + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + parameter_list=None, + regularization=None, + grad_clip=None, + name=None): + assert learning_rate is not None + assert beta1 is not None + assert beta2 is not None + assert epsilon is not None + super(AdamaxOptimizer, self).__init__( + learning_rate=learning_rate, + parameter_list=parameter_list, + regularization=regularization, + grad_clip=grad_clip, + name=name) + self.type = "adamax" + self._beta1 = beta1 + self._beta2 = beta2 + self._epsilon = epsilon + + def _create_accumulators(self, block, parameters): + # Create accumulator tensors for first moment and infinity norm + for p in parameters: + self._add_accumulator(self._moment_acc_str, p) + self._add_accumulator(self._inf_norm_acc_str, p) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + fill_value=self._beta1, + shape=[1]) + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0]) + inf_norm = self._get_accumulator(self._inf_norm_acc_str, + param_and_grad[0]) + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) + # create the adamax optimize op + adamax_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "LearningRate": self._create_param_lr(param_and_grad), + "Moment": moment, + "InfNorm": inf_norm, + "Beta1Pow": beta1_pow_acc + }, + outputs={ + "ParamOut": param_and_grad[0], + "MomentOut": moment, + "InfNormOut": inf_norm + }, + attrs={ + "beta1": self._beta1, + "beta2": self._beta2, + "epsilon": self._epsilon + }, + stop_gradient=True) + + return adamax_op + + def _finish_update(self, block, parameters_and_grads): + """Update Beta1 Power accumulator + """ + assert isinstance(block, framework.Block) + for param, grad in parameters_and_grads: + if grad is None or param.trainable is False: + continue + with 
param.block.program._optimized_guard( + [param, grad]), name_scope('adamax'): + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param) + block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}, + stop_gradient=True) + + +class AdamW(optimizer): + """ + The AdamW optimizer is implemented based on the AdamW Optimization + in paper `DECOUPLED WEIGHT DECAY REGULARIZATION `_. + it can resolves the problem of L2 regularization failure in the Adam optimizer. + + .. math:: + + t & = t + 1 + + moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad + + moemnt\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad + + learning\_rate & = learning\_rate * \\ + \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {beta}_1^t} + + param\_out & = param - learning\_rate * (\\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) + + + Args: + lr (float|Tensor, optional): The learning rate used to update ``Parameter``. + It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. + betas (list[float]|list[Tensor], optional): The list of exponential decay rate for moment estimates. + The elements of list should be float numbers or Tensor with shape [1] and data type as float32. + The default value is [0.9, 0.999]. + epsilon (float, optional): A small float value for numerical stability. + The default value is 1e-08. + parms (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float, optional): Weight decay of L2 regularization. The default value is 0. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + + **Notes**: + **Currently, AdamW doesn't support sparse parameter optimization.** + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + import numpy + + # First create the Executor. + place = fluid.CPUPlace() # fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + train_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + data = fluid.data(name='X', shape=[None, 1], dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + loss = fluid.layers.mean(hidden) + adam = paddle.optimizer.AdamW(lr=0.2) + adam.minimize(loss) + + # Run the startup program once and only once. + exe.run(startup_program) + + x = numpy.random.random(size=(10, 1)).astype('float32') + outs = exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) + """ + + +class DpsgdOptimizer(Optimizer): + """ + We implement the Dpsgd optimizer according to CCS16 paper - + Deep Learning with Differential Privacy. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + import numpy + + # First create the Executor. 
+ place = fluid.CPUPlace() # fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + train_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + data = fluid.layers.data(name='X', shape=[1], dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + loss = fluid.layers.mean(hidden) + optimizer = paddle.optimizer.Dpsgd(learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0) + optimizer.minimize(loss) + + # Run the startup program once and only once. + exe.run(startup_program) + + x = numpy.random.random(size=(10, 1)).astype('float32') + outs = exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) + + Args: + learning_rate (float|Variable): the learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + clip (float): clipping threshold + batch_size (float): batch size. + sigma (float): for gaussian noise. + parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + Notes: + Currently, DpsgdOptimizer doesn't support sparse parameter optimization. + """ + + def __init__(self, + learning_rate=0.001, + clip=0.9, + batch_size=0.999, + sigma=1e-8, + parameter_list=None): + assert learning_rate is not None + assert clip is not None + assert batch_size is not None + assert sigma is not None + super(DpsgdOptimizer, self).__init__( + learning_rate=learning_rate, parameter_list=parameter_list) + self.type = "dpsgd" + self._clip = clip + self._batch_size = batch_size + self._sigma = sigma + ''' + Note(wangzhongpu): + This property is only used for debugging, do not need to set it! + Dpsgd operator use time(NULL) as random seed to generate random number. + However, during debugging, we need determinated result, so we will set self._seed to a fixed number. + ''' + self._seed = None + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + # create the dpsgd optimize op + if self._seed == None: + self._seed = 0 + + dpsgd_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "LearningRate": self._create_param_lr(param_and_grad) + }, + outputs={"ParamOut": param_and_grad[0]}, + attrs={ + "clip": self._clip, + "batch_size": self._batch_size, + "sigma": self._sigma, + "seed": self._seed + }, + stop_gradient=True) + + return dpsgd_op + + +class DecayedAdagradOptimizer(Optimizer): + """ + The Decayed Adagrad optimizer can be seen as an Adagrad algorithm that introduces + the decay rate to solve the problem of a sharp drop in the learning rate + during model training when using the AdagradOptimizer. + + The parameter ``param_out`` update rule with gradient ``grad``: + + .. math:: + + moment\_out & = decay * moment + (1 - decay) * grad * grad + + param\_out & = param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} + + Related paper: `Adaptive Subgradient Methods for Online Learning and Stochastic + Optimization `_. + + The original paper does not have an ``epsilon`` attribute. It is added here for numerical + stability to avoid the division by zero error. + + Args: + learning_rate (float|Variable): The learning rate used to update ``Parameter``. + It can be a float value or a ``Variable`` with a float type. + decay (float, optional): The decay rate. 
The default value is 0.95. + epsilon (float, optional): A small float value for numerical stability. + The default value is 1e-06. + parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ + :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ + regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ + ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + + **Notes**: + **Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.** + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + + x = fluid.data( name='x', shape=[None, 10], dtype='float32' ) + trans = fluid.layers.fc( x, 100 ) + cost = fluid.layers.reduce_mean( trans ) + optimizer = paddle.optimizer.DecayedAdagradOptimizer(learning_rate=0.2) + optimizer.minimize(cost) + """ + _moment_acc_str = "moment" + + def __init__(self, + learning_rate, + decay=0.95, + epsilon=1.0e-6, + parameter_list=None, + regularization=None, + grad_clip=None, + name=None): + assert learning_rate is not None + assert decay is not None + assert epsilon is not None + + super(DecayedAdagradOptimizer, self).__init__( + learning_rate=learning_rate, + parameter_list=parameter_list, + regularization=regularization, + grad_clip=grad_clip, + name=name) + self.type = "decayed_adagrad" + self._decay = decay + self._epsilon = epsilon + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + + for p in parameters: + self._add_accumulator(self._moment_acc_str, p) + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + moment_acc = self._get_accumulator(self._moment_acc_str, + param_and_grad[0]) + + # Create the decayed adagrad optimizer op + decayed_adagrad_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Moment": moment_acc, + "LearningRate": self._create_param_lr(param_and_grad) + }, + outputs={"ParamOut": param_and_grad[0], + "MomentOut": moment_acc}, + attrs={"epsilon": self._epsilon, + "decay": self._decay}, + stop_gradient=True) + + return decayed_adagrad_op + + +class AdadeltaOptimizer(Optimizer): + """ + **Notes: This API does not support sparse parameter optimization.** + + Adadelta Optimizer. Please refer to this for details: + `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD `_. + + The update is done as follows: + + .. 
math:: + + E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 + + learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \\epsilon ) / ( E(g_t^2) + \\epsilon ) } + + E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\_rate)^2 + + Args: + learning_rate (float|Variable): global learning rate. + epsilon (float): a small float number for numeric stability. Default 1.0e-6. + rho (float): a floating point value indicating the decay rate. Default 0.95. + parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ + :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ + regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ + ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): The default value is None. Normally there is no need for user + to set this property. For more information, please refer to + :ref:`api_guide_Name` . + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + + image = fluid.data(name='image', shape=[None, 28], dtype='float32') + fc = fluid.layers.fc(image, size=10) + cost = fluid.layers.reduce_mean(fc) + optimizer = paddle.optimizer.Adadelta( + learning_rate=0.0003, epsilon=1.0e-6, rho=0.95) + + # optimizer_ops is a list of optimizer operators to update parameters + # params_grads is a list of (param, param_grad), where param is each + # parameter and param_grad is the gradient variable of param. 
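+            # (rho above is the decay rate of the running averages E(g^2) and E(dx^2)
+            #  from the update rule; epsilon only keeps the denominators away from zero.)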
+ optimizer_ops, params_grads = optimizer.minimize(cost) + """ + + _avg_squared_grad_acc_str = "_avg_squared_grad" + _avg_squared_update_acc_str = "_avg_squared_update" + + def __init__(self, + learning_rate, + epsilon=1.0e-6, + rho=0.95, + parameter_list=None, + regularization=None, + grad_clip=None, + name=None): + if learning_rate is None: + raise ValueError("learning_rate is not set.") + if epsilon is None: + raise ValueError("epsilon is not set.") + if rho is None: + raise ValueError("rho is not set.") + super(AdadeltaOptimizer, self).__init__( + learning_rate=learning_rate, + parameter_list=parameter_list, + regularization=regularization, + grad_clip=grad_clip, + name=name) + self.type = "adadelta" + self._epsilon = epsilon + self._rho = rho + + def _create_accumulators(self, block, parameters): + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + for p in parameters: + self._add_accumulator(self._avg_squared_grad_acc_str, p) + self._add_accumulator(self._avg_squared_update_acc_str, p) + + def _append_optimize_op(self, block, param_and_grad): + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + avg_squared_grad_acc = self._get_accumulator( + self._avg_squared_grad_acc_str, param_and_grad[0]) + avg_squared_update_acc = self._get_accumulator( + self._avg_squared_update_acc_str, param_and_grad[0]) + + # Create the adadelta optimizer op + adadelta_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "AvgSquaredGrad": avg_squared_grad_acc, + "AvgSquaredUpdate": avg_squared_update_acc + }, + outputs={ + "ParamOut": param_and_grad[0], + "AvgSquaredGradOut": avg_squared_grad_acc, + "AvgSquaredUpdateOut": avg_squared_update_acc + }, + attrs={"epsilon": self._epsilon, + "rho": self._rho}, + stop_gradient=True) + + return adadelta_op + + +class RMSPropOptimizer(Optimizer): + """ + Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning + rate method. The original slides proposed RMSProp: Slide 29 of + http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf . + + The original equation is as follows: + + .. math:: + + r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 + + w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w) + + The first equation calculates moving average of the squared gradient for + each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`. + + In some cases, adding a momentum term :math: `\\beta` is beneficial. + In our implementation, Nesterov momentum is used: + + .. math:: + + r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 + + v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) + + \\epsilon}} \\nabla Q_{i}(w) + + w & = w - v(w, t) + + if centered is True: + + .. math:: + + r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 + + g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w) + + v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 + + \\epsilon}} \\nabla Q_{i}(w) + + w & = w - v(w, t) + + where, :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95 + and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a + smoothing term to avoid division by zero, usually set somewhere in range + from 1e-4 to 1e-8. + + + Parameters: + lr(float): Global learning rate. + alpha(float): rho is :math: `\\rho` in equation, default is 0.95. 
+ eps(float): :math: `\\epsilon` in equation is smoothing term to + avoid division by zero, default is 1e-6. + momentum(float): :math:`\\beta` in equation is the momentum term, + default is 0.0. + centered(bool): If True, gradients are normalized by the estimated variance of + the gradient; if False, by the uncentered second moment. Setting this to + True may help with training, but is slightly more expensive in terms of + computation and memory. Defaults to False. + params (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float, optional): Weight decay of L2 regularization. The default value is 0. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): This parameter is used by developers to print debugging information. \ + For details, please refer to :ref:`api_guide_Name`. Default is None. + + Raises: + ValueError: If lr, rho, epsilon, momentum are None. + + Examples: + .. code-block:: python + + import paddle + import paddle.fluid as fluid + import numpy as np + + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + rms_optimizer = paddle.optimizer.RMSProp(lr=0.1) + rms_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + """ + + _momentum_acc_str = "momentum" + _mean_square_acc_str = "mean_square" + _mean_grad_acc_str = "mean_grad" + + def __init__(self, + learning_rate, + rho=0.95, + epsilon=1.0e-6, + momentum=0.0, + centered=False, + parameter_list=None, + regularization=None, + grad_clip=None, + name=None): + super(RMSPropOptimizer, self).__init__( + learning_rate=learning_rate, + parameter_list=parameter_list, + regularization=regularization, + grad_clip=grad_clip, + name=name) + if learning_rate is None: + raise ValueError("learning_rate is not set.") + if rho is None: + raise ValueError("rho is not set.") + if epsilon is None: + raise ValueError("epsilon is not set.") + if momentum is None: + raise ValueError("momentum is not set.") + + self.type = "rmsprop" + self._rho = rho + self._epsilon = epsilon + self._momentum = momentum + self._centered = centered + + def _create_accumulators(self, block, parameters): + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + for p in parameters: + self._add_accumulator(self._momentum_acc_str, p) + self._add_accumulator(self._mean_square_acc_str, p) + self._add_accumulator(self._mean_grad_acc_str, p) + + def _append_optimize_op(self, block, 
param_and_grad): + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + momentum_acc = self._get_accumulator(self._momentum_acc_str, + param_and_grad[0]) + mean_square_acc = self._get_accumulator(self._mean_square_acc_str, + param_and_grad[0]) + mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str, + param_and_grad[0]) + rmsprop_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Moment": momentum_acc, + "MeanSquare": mean_square_acc, + "MeanGrad": mean_grad_acc, + "LearningRate": self._create_param_lr(param_and_grad), + }, + outputs={ + "ParamOut": param_and_grad[0], + "MomentOut": momentum_acc, + "MeanSquareOut": mean_square_acc, + "MeanGradOut": mean_grad_acc + }, + attrs={ + "epsilon": self._epsilon, + "decay": self._rho, + "momentum": self._momentum, + "centered": self._centered + }, + stop_gradient=True) + + return rmsprop_op + + +class FtrlOptimizer(Optimizer): + """ + FTRL (Follow The Regularized Leader) Optimizer. + + The paper that proposed Follow The Regularized Leader (FTRL): + (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf) + + .. math:: + + &new\_accum = squared\_accum + grad^2 + + &if (lr\_power == -0.5): + + &\quad linear\_accum += grad - \\frac{\\sqrt{new\_accum} - \\sqrt{squared\_accum}}{learning\_rate * param} + + &else: + + &\quad linear\_accum += grad - \\frac{new\_accum^{-lr\_power} - accum^{-lr\_power}}{learning\_rate * param} + + + &x = l1 * sign(linear\_accum) - linear\_accum + + &if (lr\_power == -0.5): + + &\quad y = \\frac{\\sqrt{new\_accum}}{learning\_rate} + (2 * l2) + + &\quad pre\_shrink = \\frac{x}{y} + + &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) + + &else: + + &\quad y = \\frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2) + + &\quad pre\_shrink = \\frac{x}{y} + + &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) + + &squared\_accum += grad^2 + + Parameters: + learning_rate (float|Variable): Global learning rate. + l1 (float): L1 regularization strength, default is 0.0. + l2 (float): L2 regularization strength, default is 0.0. + lr_power (float): Learning Rate Power, default is -0.5. + parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ + :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ + regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ + ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): This parameter is used by developers to print debugging information. \ + For details, please refer to :ref:`api_guide_Name`. Default is None. 
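+
+    As an illustrative (non-authoritative) NumPy sketch of the update above for
+    the common ``lr_power = -0.5`` branch, with all names local to the sketch:
+
+    .. code-block:: python
+
+        import numpy as np
+
+        def ftrl_step(param, grad, squared_accum, linear_accum, lr, l1, l2):
+            # accumulate the squared gradient
+            new_accum = squared_accum + grad ** 2
+            # update the linear accumulator
+            sigma = (np.sqrt(new_accum) - np.sqrt(squared_accum)) / lr
+            linear_accum = linear_accum + grad - sigma * param
+            # proximal step with L1/L2 shrinkage
+            x = l1 * np.sign(linear_accum) - linear_accum
+            y = np.sqrt(new_accum) / lr + 2.0 * l2
+            new_param = np.where(np.abs(linear_accum) > l1, x / y, 0.0)
+            return new_param, new_accum, linear_accum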
+ + Raises: + ValueError: If learning_rate, rho, epsilon, momentum are None. + + Examples: + .. code-block:: python + + import paddle + import paddle.fluid as fluid + import numpy as np + + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + ftrl_optimizer = paddle.optimizer.Ftrl(learning_rate=0.1) + ftrl_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + NOTE: + Currently, FtrlOptimizer doesn't support sparse parameter optimization. + """ + + _squared_acc_str = "squared" + _linear_acc_str = "linear" + + def __init__(self, + learning_rate, + l1=0.0, + l2=0.0, + lr_power=-0.5, + parameter_list=None, + regularization=None, + grad_clip=None, + name=None): + super(FtrlOptimizer, self).__init__( + learning_rate=learning_rate, + parameter_list=parameter_list, + regularization=regularization, + grad_clip=grad_clip, + name=name) + if learning_rate is None: + raise ValueError("learning_rate is not set.") + + self.type = "ftrl" + self._l1 = l1 + self._l2 = l2 + self._lr_power = lr_power + + def _create_accumulators(self, block, parameters): + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + for p in parameters: + self._add_accumulator(self._squared_acc_str, p) + self._add_accumulator(self._linear_acc_str, p) + + def _append_optimize_op(self, block, param_and_grad): + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + squared_acc = self._get_accumulator(self._squared_acc_str, + param_and_grad[0]) + linear_acc = self._get_accumulator(self._linear_acc_str, + param_and_grad[0]) + ftrl_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "SquaredAccumulator": squared_acc, + "LinearAccumulator": linear_acc, + "LearningRate": self._create_param_lr(param_and_grad), + }, + outputs={ + "ParamOut": param_and_grad[0], + "SquaredAccumOut": squared_acc, + "LinearAccumOut": linear_acc + }, + attrs={"l1": self._l1, + "l2": self._l2, + "lr_power": self._lr_power}, + stop_gradient=True) + + return ftrl_op + + +class LambOptimizer(Adam): + """ + LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer. + + LAMB Optimizer is designed to scale up the batch size of training without losing + accuracy, which supports adaptive element-wise updating and accurate layer-wise + correction. For more information, please refer to `Large Batch Optimization for + Deep Learning: Training BERT in 76 minutes `_ . + + The updating of parameters follows: + + .. 
math:: + + m_t &= \\beta_1 m_{t - 1}+ (1 - \\beta_1)g_t + + v_t &= \\beta_2 v_{t - 1} + (1 - \\beta_2)g_t^2 + + r_t &= \\frac{m_t}{\\sqrt{v_t}+\\epsilon} + + w_t &= w_{t-1} -\\eta_t \\frac{\\left \| w_{t-1}\\right \|}{\\left \| r_t + \\lambda w_{t-1}\\right \|} (r_t + \\lambda w_{t-1}) + + + where :math:`m` is the 1st moment, and :math:`v` the 2nd moment, :math:`\\eta` the + learning rate, :math:`\\lambda` the LAMB weight decay rate. + + Args: + learning_rate (float|Variable, optional): the learning rate used to update parameters. \ + Can be a float value or a Variable with data type float32. Default 0.001. + lamb_weight_decay (float, optional): The LAMB weight decay rate. Default 0.01. + beta1 (float, optional): The exponential decay rate for the 1st moment estimates. + Default 0.9. + beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. + Default 0.999. + epsilon (float, optional): A small float value for numerical stability. Default 1e-6. + parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ + :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ + regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ + ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + exclude_from_weight_decay_fn (function|None): Exclude a parameter from weight + decay when **exclude_from_weight_decay_fn(parameter)** returns true. + Default None. + name(str|None): For detailed information, please refer to + :ref:`api_guide_Name` . Usually name is no need to set and None by default. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + import paddle + + data = fluid.data(name='x', shape=[-1, 5], dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + cost = fluid.layers.mean(hidden) + + def exclude_fn(param): + return param.name.endswith('.b_0') + + optimizer = paddle.optimizer.Lamb(learning_rate=0.002, + exclude_from_weight_decay_fn=exclude_fn) + optimizer.minimize(cost) + """ + _moment1_acc_str = "moment1" + _moment2_acc_str = "moment2" + # these two not used in op temporarily + _beta1_pow_acc_str = "beta1_pow_acc" + _beta2_pow_acc_str = "beta2_pow_acc" + + def __init__(self, + learning_rate=0.001, + lamb_weight_decay=0.01, + beta1=0.9, + beta2=0.999, + epsilon=1e-6, + parameter_list=None, + regularization=None, + grad_clip=None, + exclude_from_weight_decay_fn=None, + name=None): + assert learning_rate is not None + assert lamb_weight_decay is not None + assert beta1 is not None + assert beta2 is not None + assert epsilon is not None + super(LambOptimizer, self).__init__( + learning_rate=learning_rate, + parameter_list=parameter_list, + regularization=regularization, + grad_clip=grad_clip, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + name=name) + self.type = "lamb" + self._weight_decay = lamb_weight_decay + self._exclude_from_weight_decay_fn = exclude_from_weight_decay_fn + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + block.program._use_lamb = True + + moment1 = self._get_accumulator(self._moment1_acc_str, + param_and_grad[0]) + moment2 = self._get_accumulator(self._moment2_acc_str, + param_and_grad[0]) + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param_and_grad[0]) + + if self._exclude_from_weight_decay_fn is not None \ + and self._exclude_from_weight_decay_fn(param_and_grad[0]): + weight_decay = 0.0 + else: + weight_decay = self._weight_decay + + # create the lamb optimize op + lamb_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "LearningRate": self._create_param_lr(param_and_grad), + "Moment1": moment1, + "Moment2": moment2, + "Beta1Pow": beta1_pow_acc, + "Beta2Pow": beta2_pow_acc + }, + outputs={ + "ParamOut": param_and_grad[0], + "Moment1Out": moment1, + "Moment2Out": moment2 + }, + attrs={ + "beta1": self._beta1, + "beta2": self._beta2, + "epsilon": self._epsilon, + "weight_decay": weight_decay + }, + stop_gradient=True) + + return lamb_op + + +# We short the class name, since users will use the optimizer with the package +# name. The sample code: +# +# import paddle.fluid as fluid +# +# sgd = paddle.optimizer.SGD(...) +# +# It is no need to add an `Optimizer` as the class suffix +SGD = SGDOptimizer +Momentum = MomentumOptimizer +Adagrad = AdagradOptimizer +Adamax = AdamaxOptimizer +Dpsgd = DpsgdOptimizer +DecayedAdagrad = DecayedAdagradOptimizer +Adadelta = AdadeltaOptimizer +RMSProp = RMSPropOptimizer +Ftrl = FtrlOptimizer +LarsMomentum = LarsMomentumOptimizer +Lamb = LambOptimizer + + +class ModelAverage(Optimizer): + """ + :api_attr: Static Graph + + The ModelAverage optimizer accumulates specific continuous historical parameters + during training. The accumulated historical range can be controlled by the passed + ``average_window_rate`` argument. The averaged ``Parameter`` are used in the prediction, + which usually can improve the accuracy of the prediction. 
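+
+    Schematically (a toy, plain-Python sketch of the behaviour rather than the
+    implementation), ``apply()`` swaps in the mean of the recently accumulated
+    parameter values and ``restore()`` swaps the original value back:
+
+    .. code-block:: python
+
+        accumulated = [0.9, 1.1, 1.0]    # recent snapshots of one parameter value
+        current = 1.05                   # latest value of that parameter
+        backup = current                 # apply() first backs up the current value
+        current = sum(accumulated) / len(accumulated)   # averaged value used for prediction
+        # ... run prediction with the averaged value here ...
+        current = backup                 # restore() brings the latest value back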
+
+    The average of the ``Parameter`` is accumulated in a sliding window and saved
+    in a temporary variable. It can be applied to the current model's ``Parameter``
+    by calling the ``apply()`` method, and the current model ``Parameter`` can be
+    restored by calling the ``restore()`` method.
+
+    The window size for calculating the average is determined by ``average_window_rate``,
+    ``min_average_window``, ``max_average_window`` and the current ``Parameter`` update times (num_updates).
+
+    When the accumulation count (num_accumulates) is greater than the specific window
+    threshold (average_window), the accumulated ``Parameter`` temporary variable is set to 0.0.
+    The following condition illustrates the role of these arguments:
+
+    ::
+
+        if num_accumulates >= min_average_window and num_accumulates >= min(max_average_window, num_updates * average_window_rate):
+            num_accumulates = 0
+
+    In the condition above, ``num_accumulates`` indicates the current accumulated number,
+    which can be understood as the length of the cumulative window. The length of the
+    window must be at least the length set by the ``min_average_window`` argument, and
+    cannot exceed the length specified by the ``max_average_window`` argument or
+    ``num_updates * average_window_rate``, where ``num_updates`` indicates the current
+    ``Parameter`` update times and ``average_window_rate`` is the coefficient used to
+    calculate the window length.
+
+    Args:
+        average_window_rate (float): The ratio of the average window length to the number of ``Parameter`` updates.
+        min_average_window (int, optional): The minimum size of the average window. The default value is 10000.
+        max_average_window (int, optional): The maximum size of the average window. The default value is 10000.
+        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two methods: \
+            :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
+            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
+            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
+            Default None, meaning there is no regularization.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import paddle
+            import numpy
+
+            # First create the Executor.
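+            # (The same executor later runs the temporary apply/restore programs
+            #  that ModelAverage builds internally.)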
+ place = fluid.CPUPlace() # fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + train_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + # build net + data = fluid.data(name='X', shape=[None, 1], dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + loss = fluid.layers.mean(hidden) + optimizer = paddle.optimizer.Momentum(learning_rate=0.2, momentum=0.1) + optimizer.minimize(loss) + + # build ModelAverage optimizer + model_average = paddle.optimizer.ModelAverage(0.15, + min_average_window=10000, + max_average_window=12500) + + exe.run(startup_program) + for i in range(12500): + x = numpy.random.random(size=(10, 1)).astype('float32') + outs = exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) + + # apply ModelAverage + with model_average.apply(exe): + x = numpy.random.random(size=(10, 1)).astype('float32') + exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) + """ + + def __init__(self, + average_window_rate, + min_average_window=10000, + max_average_window=10000, + regularization=None, + name=None): + if framework.in_dygraph_mode(): + raise Exception("In dygraph, don't support ModelAverage.") + super(ModelAverage, self).__init__( + 0.0, regularization=regularization, name=name) + self.average_window = average_window_rate + self.min_average_window = min_average_window + self.max_average_window = max_average_window + + self.params_grads = [] + for param in framework.default_main_program().global_block( + ).all_parameters(): + if param.do_model_average != False: + grad = param.block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + [param.name, 'tmp'])), + dtype=param.dtype, + persistable=False, + stop_gradient=True) + self.params_grads.append((param, grad)) + + for param, grad in self.params_grads: + if grad is None: + continue + with param.block.program._optimized_guard( + [param, grad]), name_scope('move_average'): + self._append_average_accumulate_op(param) + + self.apply_program = Program() + block = self.apply_program.global_block() + with program_guard(main_program=self.apply_program): + for param_grad in self.params_grads: + self._add_average_apply_op(block, param_grad) + + self.restore_program = Program() + block = self.restore_program.global_block() + with program_guard(main_program=self.restore_program): + for param_grad in self.params_grads: + self._add_average_restore_op(block, param_grad) + + def _add_average_apply_op(self, block, param_grad): + param = block._clone_variable(param_grad[0]) + grad = block._clone_variable(param_grad[1]) + sum_1 = block._clone_variable(self._get_accumulator('sum_1', param)) + sum_2 = block._clone_variable(self._get_accumulator('sum_2', param)) + sum_3 = block._clone_variable(self._get_accumulator('sum_3', param)) + num_accumulates = block._clone_variable( + self._get_accumulator('num_accumulates', param)) + old_num_accumulates = block._clone_variable( + self._get_accumulator('old_num_accumulates', param)) + num_updates = block._clone_variable( + self._get_accumulator('num_updates', param)) + # backup param value to grad + layers.assign(input=param, output=grad) + # param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates) + tmp = layers.sum(x=[num_accumulates, old_num_accumulates]) + sum = layers.sum(x=[sum_1, sum_2, sum_3]) + tmp = layers.cast( + x=tmp, dtype='float32' if self._dtype == None else self._dtype) + sum = layers.cast( + x=sum, dtype='float32' if self._dtype == None 
else self._dtype) + ops._elementwise_div(x=sum, y=tmp, out=param) + + def _add_average_restore_op(self, block, param_grad): + param = block._clone_variable(param_grad[0]) + grad = block._clone_variable(param_grad[1]) + layers.assign(input=grad, output=param) + + def _append_average_accumulate_op(self, param): + self.helper = LayerHelper("average_accumulate") + sum_1 = self._add_accumulator('sum_1', param) + sum_2 = self._add_accumulator('sum_2', param) + sum_3 = self._add_accumulator('sum_3', param) + num_accumulates = self._add_accumulator( + 'num_accumulates', param, dtype='int64', shape=[1]) + old_num_accumulates = self._add_accumulator( + 'old_num_accumulates', param, dtype='int64', shape=[1]) + num_updates = self._add_accumulator( + 'num_updates', param, dtype='int64', shape=[1]) + + self.helper.append_op( + type='average_accumulates', + inputs={ + "param": param, + "in_sum_1": sum_1, + "in_sum_2": sum_2, + "in_sum_3": sum_3, + "in_num_accumulates": num_accumulates, + "in_old_num_accumulates": old_num_accumulates, + "in_num_updates": num_updates + }, + outputs={ + "out_sum_1": sum_1, + "out_sum_2": sum_2, + "out_sum_3": sum_3, + "out_num_accumulates": num_accumulates, + "out_old_num_accumulates": old_num_accumulates, + "out_num_updates": num_updates, + }, + attrs={ + "average_window": self.average_window, + "min_average_window": self.min_average_window, + "max_average_window": self.max_average_window, + }, + stop_gradient=True) + + @signature_safe_contextmanager + def apply(self, executor, need_restore=True): + """ + Apply the average of the cumulative ``Parameter`` to the parameters of the current model. + + Args: + executor(fluid.Executor): The current network executor. + need_restore(bool): Restore flag variable, if set to True, the network will restore + the parameters of the network to the default value, if set to False, + it will not be restored. The default value is True. + + Examples: + + .. code-block:: python + + import paddle.fluid as fluid + import paddle + import numpy + + # First create the Executor. + place = fluid.CPUPlace() # fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + train_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + # build net + data = fluid.data(name='X', shape=[None, 1], dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + loss = fluid.layers.mean(hidden) + optimizer = paddle.optimizer.Momentum(learning_rate=0.2, momentum=0.1) + optimizer.minimize(loss) + + # build ModelAverage optimizer + model_average = paddle.optimizer.ModelAverage(0.15, + min_average_window=10000, + max_average_window=12500) + + exe.run(startup_program) + for i in range(12500): + x = numpy.random.random(size=(10, 1)).astype('float32') + outs = exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) + + # apply ModelAverage + with model_average.apply(exe): + x = numpy.random.random(size=(10, 1)).astype('float32') + exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) + """ + executor.run(self.apply_program) + try: + yield + finally: + if need_restore: + self.restore(executor) + + def restore(self, executor): + """ + Restore ``Parameter`` values of current model. + + Args: + executor(fluid.Executor): The current network executor. + + Examples: + + .. code-block:: python + + import paddle.fluid as fluid + import paddle + import numpy + + # First create the Executor. 
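+                # (An explicit restore() call is only needed because apply() is
+                #  entered with need_restore=False further below.)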
+ place = fluid.CPUPlace() # fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + train_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + # build net + data = fluid.data(name='X', shape=[None, 1], dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + loss = fluid.layers.mean(hidden) + optimizer = paddle.optimizer.Momentum(learning_rate=0.2, momentum=0.1) + optimizer.minimize(loss) + + # build ModelAverage optimizer + model_average = paddle.optimizer.ModelAverage(0.15, + min_average_window=10000, + max_average_window=12500) + + exe.run(startup_program) + for i in range(12500): + x = numpy.random.random(size=(10, 1)).astype('float32') + outs = exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) + + # apply ModelAverage + with model_average.apply(exe, False): + x = numpy.random.random(size=(10, 1)).astype('float32') + exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) + + # restore Parameters + model_average.restore(exe) + """ + executor.run(self.restore_program) + + +class ExponentialMovingAverage(object): + """ + :api_attr: Static Graph + + Compute the moving average of parameters with exponential decay. + Given a parameter :math:`\\theta`, its exponential moving average (EMA) + will be + + .. math:: + + \\text{EMA}_0 & = 0 + + \\text{EMA}_t & = \\text{decay} * \\text{EMA}_{t-1} + (1 - \\text{decay}) * \\theta_t + + The average results calculated by **update()** method will be saved in + temporary variables which are created and maintained by the object, and can + be applied to parameters of current model by calling **apply()** method. And + the **restore()** method is used to restore the parameters. + + **Bias correction**. All EMAs are initialized to :math:`0` and hence they will be + zero biased, which can be corrected by divided by a factor + :math:`(1 - \\text{decay}^t)` , i.e., the actual EMAs applied to parameters + when calling **apply()** method would be + + .. math:: + + \\widehat{\\text{EMA}}_t = \\frac{\\text{EMA}_t}{1 - \\text{decay}^t} + + **Decay rate scheduling**. A large decay rate very close to 1 would result + in that the averages move very slowly. And a better strategy is to set a + relative smaller decay rate in the very beginning. The argument **thres_steps** + allows users to pass a Variable to schedule the decay rate, in this case, + the actual decay rate becomes + + .. math:: + + \\min(\\text{decay}, \\frac{1 + \\text{thres_steps}}{10 + \\text{thres_steps}}) + + Usually **thres_steps** can be the global training steps. + + + Args: + decay (float, optional): The exponential decay rate, usually close to 1, such as + 0.999, 0.9999, ... . Default 0.999. + thres_steps (Variable|None): If not `None`, schedule the decay rate. + Default None. + name (str|None): For detailed information, please refer to + :ref:`api_guide_Name`. Usually name is no need to set and None by + default. + + + Examples: + + .. 
code-block:: python + + import numpy + import paddle + import paddle.fluid as fluid + + data = fluid.data(name='x', shape=[-1, 5], dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + cost = fluid.layers.mean(hidden) + + test_program = fluid.default_main_program().clone(for_test=True) + + optimizer = paddle.optimizer.Adam(lr=0.001) + optimizer.minimize(cost) + + global_steps = fluid.layers.autoincreased_step_counter() + ema = paddle.optimizer.ExponentialMovingAverage(0.999, thres_steps=global_steps) + ema.update() + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + for pass_id in range(3): + for batch_id in range(6): + data = numpy.random.random(size=(10, 5)).astype('float32') + exe.run(program=fluid.default_main_program(), + feed={'x': data}, + fetch_list=[cost.name]) + + # usage 1 + with ema.apply(exe): + data = numpy.random.random(size=(10, 5)).astype('float32') + exe.run(program=test_program, + feed={'x': data}, + fetch_list=[hidden.name]) + + + # usage 2 + with ema.apply(exe, need_restore=False): + data = numpy.random.random(size=(10, 5)).astype('float32') + exe.run(program=test_program, + feed={'x': data}, + fetch_list=[hidden.name]) + ema.restore(exe) + """ + + def __init__(self, decay=0.999, thres_steps=None, name=None): + if framework.in_dygraph_mode(): + raise Exception( + "In dygraph, don't support ExponentialMovingAverage.") + self._decay = decay + self._thres_steps = thres_steps + self._name = name if name is not None else '' + self._decay_var = self._get_ema_decay() + + self._step_counter_name = "@EMA_STEP_COUNTER@" + self._params_tmps = [] + for param in default_main_program().global_block().all_parameters(): + if param.do_model_average != False: + tmp = param.block.create_var( + name=unique_name.generate(".".join( + [self._name + param.name, 'ema_tmp'])), + dtype=param.dtype, + persistable=False, + stop_gradient=True) + self._params_tmps.append((param, tmp)) + + self._ema_vars = {} + for param, tmp in self._params_tmps: + with param.block.program._optimized_guard( + [param, tmp]), name_scope('moving_average'): + self._ema_vars[param.name] = self._create_ema_vars(param) + + self.apply_program = Program() + block = self.apply_program.global_block() + with program_guard(main_program=self.apply_program): + decay_pow, global_step = self._get_decay_pow(block) + for param, tmp in self._params_tmps: + param = block._clone_variable(param) + tmp = block._clone_variable(tmp) + ema = block._clone_variable(self._ema_vars[param.name]) + layers.assign(input=param, output=tmp) + # bias correction + with layers.control_flow.Switch() as switch: + with switch.case(global_step > 0): + layers.assign(output=ema, input=ema / (1.0 - decay_pow)) + layers.assign(input=ema, output=param) + + self.restore_program = Program() + block = self.restore_program.global_block() + with program_guard(main_program=self.restore_program): + for param, tmp in self._params_tmps: + tmp = block._clone_variable(tmp) + param = block._clone_variable(param) + layers.assign(input=tmp, output=param) + + def _get_ema_decay(self): + with default_main_program()._lr_schedule_guard(): + decay_var = layers.tensor.create_global_var( + shape=[1], + value=self._decay, + dtype='float32', + persistable=True, + name="scheduled_ema_decay_rate") + + if self._thres_steps is not None: + decay_t = (self._thres_steps + 1.0) / (self._thres_steps + 10.0) + with layers.control_flow.Switch() as switch: + with switch.case(decay_t < self._decay): + layers.tensor.assign(decay_t, 
decay_var) + with switch.default(): + layers.tensor.assign( + np.array( + [self._decay], dtype=np.float32), + decay_var) + return decay_var + + def _get_decay_pow(self, block): + global_step = layers.create_global_var( + name=self._step_counter_name, + shape=[1], + value=0, + dtype='int64', + persistable=True) + global_step = layers.cast(global_step, "float32") + decay_var = block._clone_variable(self._decay_var) + decay_pow_acc = layers.elementwise_pow(decay_var, global_step) + return decay_pow_acc, global_step + + def _create_ema_vars(self, param): + param_ema = layers.create_global_var( + name=unique_name.generate(self._name + param.name + '_ema'), + shape=param.shape, + value=0.0, + dtype=param.dtype, + persistable=True) + + return param_ema + + def update(self): + """ + Update Exponential Moving Average. Should only call this method in + train program. + """ + global_step = layers.autoincreased_step_counter( + counter_name=self._step_counter_name) + param_master_emas = [] + for param, tmp in self._params_tmps: + with param.block.program._optimized_guard( + [param, tmp]), name_scope('moving_average'): + param_ema = self._ema_vars[param.name] + if param.name + '.master' in self._ema_vars: + master_ema = self._ema_vars[param.name + '.master'] + param_master_emas.append([param_ema, master_ema]) + else: + ema_t = param_ema * self._decay_var + param * ( + 1 - self._decay_var) + layers.assign(input=ema_t, output=param_ema) + + # for fp16 params + for param_ema, master_ema in param_master_emas: + default_main_program().global_block().append_op( + type="cast", + inputs={"X": master_ema}, + outputs={"Out": param_ema}, + attrs={ + "in_dtype": master_ema.dtype, + "out_dtype": param_ema.dtype + }) + + @signature_safe_contextmanager + def apply(self, executor, need_restore=True): + """ + Apply moving average to parameters for evaluation. + + Args: + executor (Executor): The Executor to execute applying. + need_restore (bool, optional): Whether to restore parameters after + applying. Default True. + """ + executor.run(self.apply_program) + try: + yield + finally: + if need_restore: + self.restore(executor) + + def restore(self, executor): + """Restore parameters. + + Args: + executor (Executor): The Executor to execute restoring. + """ + executor.run(self.restore_program) + + +class PipelineOptimizer(object): + """ + :api_attr: Static Graph + + Pipeline Optimizer: Make a program to run as pipeline, that is splitting a + program into multiple sections (sub-programs) and each section run on a + device to enable the training of large scale models and the use of + heterogeneous devices. Meanwhile, all sections run in the stype of pipeline. + + Args: + optimizer (Optimizer): The optimizer to use, such as SGD. + num_microbatches (int): Number of microbatches. [Optional. Default:1]. + start_cpu_core_id (int): The first cpu core id to use. [Optional. Default:0]. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + import paddle + import paddle.fluid.layers as layers + + with fluid.device_guard("gpu:0"): + x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0) + y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0) + data_loader = fluid.io.DataLoader.from_generator( + feed_list=[x, y], + capacity=64, + use_double_buffer=True, + iterable=False) + + emb_x = layers.embedding(input=x, param_attr=fluid.ParamAttr(name="embx"), size=[10,2], is_sparse=False) + emb_y = layers.embedding(input=y, param_attr=fluid.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False) + + with fluid.device_guard("gpu:1"): + concat = layers.concat([emb_x, emb_y], axis=1) + fc = layers.fc(input=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False) + loss = layers.reduce_mean(fc) + optimizer = paddle.optimizer.SGD(learning_rate=0.5) + optimizer = paddle.optimizer.PipelineOptimizer(optimizer) + optimizer.minimize(loss) + + def train_reader(): + for _ in range(4): + x = np.random.random(size=[1]).astype('int64') + y = np.random.random(size=[1]).astype('int64') + yield x, y + data_loader.set_sample_generator(train_reader, batch_size=1) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + batch_size = 1 + filelist = [] # you should set your own filelist, e.g. filelist = ["dataA.txt"] + dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset") + dataset.set_use_var([x,y]) + dataset.set_batch_size(batch_size) + dataset.set_filelist(filelist) + data_loader.start() + exe.train_from_dataset( + fluid.default_main_program(), + dataset) + data_loader.reset() + """ + + def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): + if framework.in_dygraph_mode(): + raise Exception("In dygraph, don't support PipelineOptimizer.") + if not isinstance(optimizer, Optimizer): + raise ValueError("The 'optimizer' parameter for " + "PipelineOptimizer must be an instance of " + "Optimizer, but the given type is {}.".format( + type(optimizer))) + self._optimizer = optimizer + assert num_microbatches >= 1, ( + "num_microbatches must be a positive value.") + self._num_microbatches = num_microbatches + assert start_cpu_core_id >= 0, ( + "start_cpu_core_id must be greater than or equal to 0.") + self._start_cpu_core_id = start_cpu_core_id + self._place_list = None + op_maker = core.op_proto_and_checker_maker + self._op_role = op_maker.OpRole + self._op_role_key = op_maker.kOpRoleAttrName() + self._op_role_var_key = op_maker.kOpRoleVarAttrName() + self._op_device_key = op_maker.kOpDeviceAttrName() + self._param_device_map = dict() + + def _create_vars(self, block, main_program): + # Create vars for block, copied from main_program's global block + used_var_set = set() + for op_idx in range(block.desc.op_size()): + op_desc = block.desc.op(op_idx) + vars = op_desc.input_arg_names() + op_desc.output_arg_names() + for var in vars: + # a var whose name contains "blocking_queue" + # only exists in startup program + if var in used_var_set or "_blocking_queue" in var: + continue + used_var_set.add(var) + source_var = main_program.block(0).var(str(var)) + if source_var.type == core.VarDesc.VarType.READER: + block.create_var(name=var, type=core.VarDesc.VarType.READER) + else: + block._clone_variable(source_var, False) + + def _is_loss_grad_op(self, op): + if self._op_role_key not in op.attr_names: + return False + op_role = int(op.all_attrs()[self._op_role_key]) + return op_role & 
int(self._op_role.Backward) and op_role & int( + self._op_role.Loss) + + def _is_backward_op(self, op): + return self._op_role_key in op.attr_names and int(op.all_attrs()[ + self._op_role_key]) & int(self._op_role.Backward) + + def _is_optimize_op(self, op): + return self._op_role_key in op.attr_names and int(op.all_attrs()[ + self._op_role_key]) & int(self._op_role.Optimize) + + def _is_update_op(self, op): + return 'Param' in op.input_names and 'Grad' in op.input_names and ( + "LearningRate" in op.input_names) + + def _split_program(self, main_program): + """ + Split a program into sections according to devices that ops run on. + + Args: + main_program (Program): the main program + """ + programs = [] + # Map from device to its corresponding section program info + device_program_map = dict() + block = main_program.block(0) + + for op in block.ops: + device = op.attr(self._op_device_key) + + if device not in device_program_map: + program = {"program": Program()} + device_program_map[device] = program + program = device_program_map[device] + op_desc = op.desc + ap_op = program["program"].block(0).desc.append_op() + ap_op.copy_from(op_desc) + + for key in sorted(device_program_map.keys()): + program = device_program_map[key] + program['program']._sync_with_cpp() + programs.append(program) + + return programs + + def _find_post_op(self, ops, cur_op, var_name): + """ + Find the real post op that has variable named var_name as input. + + Args: + ops (list): A list of ops. + cur_op (Operator): Current operator which has variable named + var_name as output. + var_name (string): Variable name. + """ + post_op = [] + before = True + for op in ops: + if op == cur_op: + before = False + continue + if before: + continue + for in_var_name in op.input_arg_names: + if in_var_name == var_name: + post_op.append(op) + if post_op: + if not len(post_op) == 1: + raise ValueError("Each op can only have one post op.") + return post_op[0] + return None + + def _find_real_prev_op(self, ops, cur_op, var_name): + """ + Find the real previous op that outputs variable named var_name. + + Args: + ops (list): A list of ops. + cur_op (Operator): Current operator which has variable named + var_name as input. + var_name (string): Variable name. + """ + prev_op = [] + for op in ops: + if op == cur_op: + break + for out_var_name in op.output_arg_names: + if out_var_name == var_name: + prev_op.append(op) + if prev_op: + # A op may have more than one prev op, + # e.g., for 'learning_rate', there may be multiple ops have it as + # output. + return prev_op[-1] + return None + + def _rename_arg(self, op, old_name, new_name): + op_desc = op.desc + if isinstance(op_desc, tuple): + op_desc = op_desc[0] + op_desc._rename_input(old_name, new_name) + op_desc._rename_output(old_name, new_name) + + def _create_var(self, block, ref_var, name): + """ + Create a new var for block, which has the same type, + shape and dtype as ref_var, then rename it with the + name `name`. + """ + new_var = block.create_var( + name=name, + shape=ref_var.shape, + dtype=ref_var.dtype, + type=ref_var.type, + lod_level=ref_var.lod_level, + persistable=False, + is_data=False, + need_check_feed=ref_var.desc.need_check_feed()) + return new_var + + def _get_data_var_info(self, block): + """ + Get all vars whose is_data attribute are true and then rename them. + + For PipelineTrainer, all data vars are binded to + minibatch scope, so we have to feed them to the microbatch + to avoid conflicts. The vars feeded to microbatch have to + be renamed. 
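+
+        Returns a pair ``(data_devices_map, raw_name_new_name_map)``: the devices
+        each data var is consumed on, and the renamed name chosen for each var.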
+ """ + # A map from var name to the renamed name. + raw_name_new_name_map = dict() + # Because we will create vars in block, it is more safe + # to get all var_names before iteration. + var_names = list(block.vars.keys()) + for var_name in var_names: + var = block.var(var_name) + if not var.is_data: + continue + assert var_name not in raw_name_new_name_map, ( + "{} has already been processed.".format(var_name)) + new_name = unique_name.generate(var_name) + raw_name_new_name_map[var_name] = new_name + new_var = self._create_var(block, var, new_name) + new_var.is_data = False + + # map of data to devices that that data on + data_devices_map = dict() + for op in block.ops: + dev_spec = op.attr(self._op_device_key) + for var_name in op.input_arg_names: + if var_name not in raw_name_new_name_map: + continue + if not var_name in data_devices_map: + data_devices_map[var_name] = [] + if not dev_spec in data_devices_map[var_name]: + data_devices_map[var_name].append(dev_spec) + new_name = raw_name_new_name_map[var_name] + #self._rename_arg(op, var_name, new_name) + return data_devices_map, raw_name_new_name_map + + def _rename_var_in_block(self, block, raw_name_new_name_map): + """ + Rename vars whose names in raw_name_new_name_map to the corresponding + new names. + """ + for op in block.ops: + if op.type == "enqueue" or op.type == "dequeue": + continue + for var_name in op.input_arg_names: + if var_name in raw_name_new_name_map: + new_name = raw_name_new_name_map[var_name] + self._rename_arg(op, var_name, new_name) + + def _insert_enq_deq_for_data_var(self, main_block, programs, startup, + devices): + """ + Insert enqueue and dequeue ops for data var + + Args: + main_block (Block): Global block for main program + programs (dict): Dictionary for section params + startup (Program): Startup program + devices (list): List of devices in the format (dev:dev_index) + """ + main_program = main_block.program + data_devices_map, raw_name_new_name_map = self._get_data_var_info( + main_block) + + first_prog = programs[0]['program'] + first_block = first_prog.block(0) + enqueue_index = 0 + if first_block.ops[0].type == "create_py_reader" or ( + first_block.ops[1].type == "create_py_reader"): + for op in first_block.ops: + if op.type == "read": + enqueue_index += 1 + break + enqueue_index += 1 + first_dev_spec = devices[0] + for var_name in data_devices_map.keys(): + for device in data_devices_map[var_name]: + # step1: generate queue for each pair of data var and device + # that that data on + queue_name = var_name + "_blocking_queue" + queue_name = unique_name.generate(queue_name) + queue_var = startup.block(0).create_var( + name=queue_name, + persistable=True, + type=core.VarDesc.VarType.RAW) + startup.block(0).append_op( + type='queue_generator', + attrs={ + 'names': [queue_name], + 'capacity': self._num_microbatches + }) + main_var = main_block.var(var_name) + assert main_var.is_data + if not var_name in first_block.vars: + self._create_var(first_block, main_var, var_name) + first_block._insert_op( + index=enqueue_index, + type='enqueue', + inputs={'X': first_block.var(var_name)}, + attrs={ + 'queue_name': queue_name, + self._op_device_key: first_dev_spec, + self._op_role_key: self._op_role.Forward + }) + # Get the device that that data on + assert device in devices + prog_index = devices.index(device) + prog = programs[prog_index]['program'] + block = prog.block(0) + index = 0 + if device == first_dev_spec: + index = enqueue_index + 1 + new_name = raw_name_new_name_map[var_name] + source_var = 
main_program.block(0).var(var_name) + new_var = self._create_var(block, source_var, new_name) + block._insert_op( + index=index, + type='dequeue', + outputs={'Out': [new_var]}, + attrs={ + self._op_device_key: device, + self._op_role_key: self._op_role.Forward, + 'queue_name': queue_name, + }) + self._rename_var_in_block(block, raw_name_new_name_map) + + def _strip_grad_suffix(self, name): + """ + Strip the grad suffix from the given variable name + """ + pos = name.find(core.grad_var_suffix()) + return name[:pos] if pos != -1 else name + + def _append_grad_suffix(self, name): + """ + Append grad suffix to the given variable name + """ + return name + core.grad_var_suffix() + + def _update_param_device_map(self, params_grads, block): + for param_grad in params_grads: + if not param_grad[0].trainable: continue + param_name = param_grad[0].name + ops = block.ops + for op in ops: + input_arg_names = op.input_arg_names + if param_name in input_arg_names: + self._param_device_map[param_name] = op.attr( + self._op_device_key) + break + + def _add_opdevice_attr_for_regularization_clip(self, block): + """ + Add op_device attribute for regulization and clip ops. + """ + for op in block.ops: + # role for regularization and clip ops is optimize + if int(op.attr(self._op_role_key)) != int(self._op_role.Optimize): + continue + if op.has_attr(self._op_device_key) and ( + op.attr(self._op_device_key) != ""): + continue + assert self._op_role_var_key in op.attr_names + op_role_var = op.all_attrs()[self._op_role_var_key] + assert len(op_role_var) == 2 + param_name = block.vars[op_role_var[0]].name + device = self._param_device_map[param_name] + op._set_attr(self._op_device_key, device) + + def _add_default_opdevice_attr(self, block): + """ + 1. Add default op_device attribute for lr-related ops. + The default value is the one that of the first place. + 2. Add default op_device attribute for sum ops added during + backward. For these ops, we set the op_device attribute + as the one of its post op, i.e, which op has the output of the + sum op as an input. + """ + first_devcie = "" + + # Get the device spec of the first place. + # device_spec: 'cpu' for cpu device and 'gpu:id' for gpu device, + # e.g. 'gpu:0', 'gpu:1', etc. + for op in block.ops: + if op.has_attr(self._op_device_key) and ( + op.attr(self._op_device_key) != ""): + first_device = op.attr(self._op_device_key) + break + assert first_device + + # set op_device attr for lr-related ops + lrsched_role = int(self._op_role.LRSched) + for op in block.ops: + if not op.has_attr(self._op_device_key) or ( + op.attr(self._op_device_key) == ""): + if op.type == "sum": + # For sum ops that compute the sum of @RENAMED@ vars + for name in op.desc.input_arg_names(): + assert '@RENAME@' in name + assert len(op.desc.output_arg_names()) == 1 + out_name = op.desc.output_arg_names()[0] + post_op = self._find_post_op(block.ops, op, out_name) + device = post_op.attr(self._op_device_key) + assert device + op._set_attr(self._op_device_key, device) + continue + + assert op.attr(self._op_role_key) == lrsched_role, ( + "Op whose op_device attr has not been set for pipeline" + " must be of the role LRSched.") + op._set_attr(self._op_device_key, first_device) + + def _check_validation(self, block): + """ + Check whether ops in a block are all validate (i.e., the + op_device attribute has been set). + Then, return all device specifications in order. 
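+        Device specs look like ``cpu`` or ``gpu:0``.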
+ """ + device_specs = [] + for op in block.ops: + type = op.type + if not op._has_kernel(type): + assert op.type == "conditional_block" and ( + op.attr(self._op_role_key) == int(self._op_role.LRSched)), ( + "Now, the only supported op without kernel is " + "conditional_block, and its op role must be LRSched.") + assert op.has_attr(self._op_device_key), ( + "op ({}) has no {} attribute.".format(op.type, + self._op_device_key)) + dev_spec = op.attr(self._op_device_key) + assert dev_spec, ("op_device attribute for op " + "{} has not been set.".format(op.type)) + if not dev_spec in device_specs: + device_specs.append(dev_spec) + return device_specs + + def _insert_enq_deq_ops_for_boundaries(self, block, origin_block, + startup_program): + """ + Insert a pair of enqueue and dequeue ops for every two + consecutive ops on different devices. + """ + startup_block = startup_program.global_block() + extra_index = 0 + + # A map from var to device spec where op takes it as input, + # avoiding multiple enqueue and dequeue ops. + var_devspec = dict() + + for index, op in list(enumerate(origin_block.ops)): + cur_device_spec = op.attr(self._op_device_key) + for var_name in op.input_arg_names: + # i.e., lod_tensor_blocking_queue created by DataLoader, + # which only exists in startup program. + if not var_name in origin_block.vars: continue + var = block.var(var_name) + # skip data, because we will process it later + if var.is_data: continue + prev_op = self._find_real_prev_op(origin_block.ops, op, + var_name) + if prev_op is None: + continue + prev_device_spec = prev_op.attr(self._op_device_key) + + if prev_device_spec != cur_device_spec: + if var_name not in var_devspec: + var_devspec[var_name] = [] + if cur_device_spec in var_devspec[var_name]: continue + var_devspec[var_name].append(cur_device_spec) + + queue_name = var_name + "_blocking_queue" + queue_name = unique_name.generate(queue_name) + queue_var = startup_block.create_var( + name=queue_name, + persistable=True, + type=core.VarDesc.VarType.RAW) + startup_block.append_op( + type='queue_generator', + attrs={ + 'names': [queue_name], + 'capacity': self._num_microbatches + }) + op_role = op.all_attrs()[self._op_role_key] + var = block.vars[var_name] + block._insert_op( + index=index + extra_index, + type='enqueue', + inputs={'X': var}, + attrs={ + 'queue_name': queue_name, + self._op_device_key: prev_device_spec, + self._op_role_key: op_role + }) + extra_index += 1 + block._insert_op( + index=index + extra_index, + type='dequeue', + outputs={'Out': [var]}, + attrs={ + self._op_device_key: cur_device_spec, + 'queue_name': queue_name, + self._op_role_key: op_role + }) + extra_index += 1 + + def _add_dequeue_ops_for_optimize(self, block, startup_program): + startup_block = startup_program.global_block() + grad_queue_map = dict() + grad_device_map = dict() + optimize_index = None + grad_names_to_dequeue = [] + + for index, op in reversed(list(enumerate(block.ops))): + device = op.attr(self._op_device_key) + # Optimizer pass + if not self._is_optimize_op(op): + optimize_index = index + 1 + break + if not self._is_update_op(op): continue + assert self._op_role_var_key in op.attr_names + op_role_var = op.all_attrs()[self._op_role_var_key] + assert len(op_role_var) == 2 + grad_name = op_role_var[1] + assert grad_name not in grad_device_map + assert grad_name not in grad_names_to_dequeue + grad_device_map[grad_name] = device + grad_names_to_dequeue.append(grad_name) + + for grad_name in grad_names_to_dequeue: + device = grad_device_map[grad_name] + 
grad_names = [] + grads = [] + queue_name = grad_name + "_blocking_queue" + queue_name = unique_name.generate(queue_name) + grad_queue_map[grad_name] = queue_name + ref_var = block.vars[grad_name] + queue_var = startup_block.create_var( + name=queue_name, + persistable=True, + type=core.VarDesc.VarType.RAW) + startup_block.append_op( + type='queue_generator', + attrs={ + 'names': [queue_name], + 'capacity': self._num_microbatches + }) + orig_var_name = self._strip_grad_suffix(grad_name) + for _ in range(self._num_microbatches): + u_name = unique_name.generate(orig_var_name) + u_grad_name = self._append_grad_suffix(u_name) + grad_var = self._create_var(block, ref_var, u_grad_name) + grad_names.append(u_grad_name) + grads.append(grad_var) + block._insert_op( + index=optimize_index, + type='dequeue', + outputs={'Out': grads}, + attrs={ + self._op_device_key: device, + 'queue_name': queue_name, + self._op_role_key: self._op_role.Optimize + }) + block._insert_op( + index=optimize_index + 1, + type='sum', + inputs={'X': grad_names}, + outputs={'Out': ref_var}, + attrs={ + self._op_device_key: device, + self._op_role_key: self._op_role.Optimize + }) + return grad_queue_map + + def _insert_enq_deq_ops_for_update(self, block, startup_program): + """ + Insert enqueue and dequeue ops for gradients of parameters. + """ + startup_block = startup_program.global_block() + grad_queue_map = self._add_dequeue_ops_for_optimize(block, + startup_program) + + for index, op in reversed(list(enumerate(block.ops))): + offset = index + device = op.attr(self._op_device_key) + + # Backward pass + if self._is_loss_grad_op(op): + loss_grad_var = block.vars[op.output_arg_names[0]] + scale_factor = self._num_microbatches + block._insert_op( + index=index + 1, + type='scale', + inputs={'X': loss_grad_var}, + outputs={'Out': loss_grad_var}, + attrs={ + 'scale': 1.0 / scale_factor, + self._op_device_key: device, + self._op_role_key: self._op_role.Backward + }) + break + if self._is_backward_op(op) and ( + self._op_role_var_key in op.attr_names): + op_role_var = op.all_attrs()[self._op_role_var_key] + + if len(op_role_var) == 0: + continue + assert len(op_role_var) % 2 == 0 + for i in range(0, len(op_role_var), 2): + grad_name = op_role_var[i + 1] + grad_var = block.vars[grad_name] + assert grad_name in grad_queue_map + queue_name = grad_queue_map[grad_name] + block._insert_op( + index=offset + 1, + type='enqueue', + inputs={'X': block.vars[grad_name]}, + attrs={ + 'queue_name': queue_name, + self._op_device_key: device, + self._op_role_key: self._op_role.Backward + }) + offset += 1 + + def _add_sub_blocks(self, main_block, program_list): + main_program = main_block.program + for prog_info in program_list: + prog = prog_info['program'] + for op in prog.block(0).ops: + if not op.has_attr('sub_block'): + continue + origin_sub_block_id = op.attr('sub_block').id + origin_sub_block = main_program.block(origin_sub_block_id) + new_sub_block = prog._create_block(parent_idx=0) + for op in origin_sub_block.ops: + op_desc = op.desc + ap_op = new_sub_block.desc.append_op() + ap_op.copy_from(op_desc) + new_sub_block._sync_with_cpp() + op._set_attr('sub_block:', new_sub_block) + + def _get_device_info(self, block): + for op in block.ops: + if not op._has_kernel(op.type): continue + op_device = op.attr(self._op_device_key) + return op_device + + def _process_persistable_vars_in_multi_sections(self, main_program, + startup_prog, program_list): + """ + Special Case: process persistable vars that exist in + multiple sections, e.g., shared 
weight + """ + # var_info = {var_name: [program1, program2...]}, + # persistable var only + var_info = dict() + for prog_info in program_list: + prog = prog_info['program'] + block = prog.block(0) + for var_name in block.vars: + var = block.var(var_name) + if not var.persistable: continue + if not var_name in var_info: + var_info[var_name] = [] + if not prog in var_info[var_name]: + var_info[var_name].append(prog) + for var_name in list(var_info.keys()): + if len(var_info[var_name]) == 1: + var_info.pop(var_name) + + # write_info = {var_name: program}, where program is the only program + # in which the var named var_name is written. + write_info = dict() + for var_name in var_info.keys(): + for prog in var_info[var_name]: + block = prog.block(0) + for op in block.ops: + if op.type == "dequeue": continue + # We have processed lr related vars + if op.attr(self._op_role_key) == int( + self._op_role.Optimize.LRSched): + continue + if var_name in op.desc.output_arg_names(): + assert var_name not in write_info, ( + "two sections write the same var({}): second " + "op {}.".format(var_name, op)) + write_info[var_name] = prog + break + + for var_name in var_info.keys(): + # Case 1: read only variables, no special process + if not var_name in write_info: continue + + # Case 2: one write multiple reads + write_prog = write_info[var_name] + write_block = write_prog.block(0) + write_device = self._get_device_info(write_block) + all_progs = var_info[var_name] + for prog in all_progs: + if prog == write_prog: continue + + queue_name = var_name + "_blocking_queue" + queue_name = unique_name.generate(queue_name) + queue_var = startup_prog.block(0).create_var( + name=queue_name, + persistable=True, + type=core.VarDesc.VarType.RAW) + startup_prog.block(0).append_op( + type='queue_generator', + attrs={ + 'names': [queue_name], + 'capacity': self._num_microbatches + }) + write_block._insert_op( + index=0, + type='enqueue', + inputs={'X': write_block.var(var_name), }, + attrs={ + 'queue_name': queue_name, + self._op_device_key: write_device, + # A trick to make the role LRSched to avoid copy every + # microbatch + self._op_role_key: self._op_role.LRSched + }) + read_block = prog.block(0) + read_device = self._get_device_info(read_block) + read_block._insert_op( + index=0, + type='dequeue', + outputs={'Out': [read_block.var(var_name)]}, + attrs={ + self._op_device_key: read_device, + # A trick to make the role LRSched to avoid copy every + # microbatch + self._op_role_key: self._op_role.LRSched, + 'queue_name': queue_name, + }) + + def minimize(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + main_block = loss.block + if startup_program is None: + startup_program = default_startup_program() + optimize_ops, params_grads = self._optimizer.minimize( + loss, startup_program, parameter_list, no_grad_set) + self._update_param_device_map(params_grads, main_block) + + # Step1: add default op_device attribute for regulization and clip ops + self._add_opdevice_attr_for_regularization_clip(main_block) + + # Step2: add default op_device attribute for ops whose op_device + # attribute have not been set yet. 
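+        # (lr-scheduling ops default to the device of the first place;
+        # gradient-sum ops added by backward inherit the device of the op
+        # that consumes their output.)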
+ self._add_default_opdevice_attr(main_block) + device_specs = self._check_validation(main_block) + + # Step3: add enqueue and dequeue ops between section boundaries + origin_prog = main_block.program.clone(for_test=False) + origin_main_block = origin_prog.global_block() + self._insert_enq_deq_ops_for_boundaries(main_block, origin_main_block, + startup_program) + + # Step4: add a pair of enqueue and dequeueN for parameter gradients + self._insert_enq_deq_ops_for_update(main_block, startup_program) + + main_program = main_block.program + + place_list = [] + place_id_list = [] + for dev_spec in device_specs: + if dev_spec == "cpu": + place_list.append(core.CPUPlace()) + place_id_list.append(-1) + elif "gpu" in dev_spec and ":" in dev_spec: + dev_index = dev_spec.split(":")[1] + place_list.append(core.CUDAPlace(int(dev_index))) + place_id_list.append(int(dev_index)) + else: + raise ValueError("Unknown device type: %s", dev_spec) + + # Step5: split program into sections and add pairs of + # enqueue and dequeue ops for data var. + if len(place_list) == 0: + program_list = [] + ptmp = { + "program": main_program, + "input_set": set(), + "output_set": set() + } + program_list.append(ptmp) + else: + program_list = self._split_program(main_program) + for p in program_list: + self._create_vars(p["program"].block(0), main_program) + self._insert_enq_deq_for_data_var(main_block, program_list, + startup_program, device_specs) + + # Step6: Special Case: process persistable vars that exist in + # multiple sections + self._process_persistable_vars_in_multi_sections( + main_program, startup_program, program_list) + + # Step7: Add sub blocks for section programs + self._add_sub_blocks(main_block, program_list) + + main_program._pipeline_opt = { + "trainer": "PipelineTrainer", + "device_worker": "Section", + "section_program_list": program_list, + "place_list": place_list, + "place_id_list": place_id_list, + "sync_steps": -1, + "num_microbatches": self._num_microbatches, + "start_cpu_core_id": self._start_cpu_core_id, + } + return optimize_ops, params_grads, program_list + + +class RecomputeOptimizer(Optimizer): + """ + :api_attr: Static Graph + + Recompute Optimizer Wrapper + + Normally, a training step contains three sub-steps: first, run forward + Operators to calculate the loss; second, run backward Operators to + calculate gradient of the parameters; third, apply optimization method + to update the value of the parameters. + + In the forward computation process, all variables that are needed by + backward computation process will be kept in memory, which occupy a great + amount of memory when the network becomes very deep. + + Recompute split the network to k segments. In each segment, It will + recompute the forward Operators, before running backward operators. It is + very helpful for saving memory. + + The Variables that separate a network to segments are called as checkpoints, + and users should set it manually. The usage is very simple: + + Args: + optimizer (Optimizer): The optimizer that is applied to parameters. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + import paddle + import numpy as np + def gen_data(): + return {"x": np.random.random(size=(32, 32)).astype('float32'), + "y": np.random.randint(2, size=(32, 1)).astype('int64')} + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + print(input_x) + fc_1 = fluid.layers.fc(input=input_x, size=hid_dim) + prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + sum_cost = fluid.layers.reduce_mean(cost) + return sum_cost, fc_1, prediction + input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') + input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + + sgd = paddle.optimizer.Adam(lr=0.01) + sgd = paddle.optimizer.RecomputeOptimizer(sgd) + sgd._set_checkpoints([fc_1, pred]) + sgd.minimize(cost) + + print("Finished optimize") + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + step = 10 + + for i in range(step): + cost_val = exe.run(feed=gen_data(), + program=fluid.default_main_program(), + fetch_list=[cost.name]) + print("step=%d cost=%f" % (i, cost_val[0])) + + """ + + def __init__(self, optimizer): + if framework.in_dygraph_mode(): + raise Exception("In dygraph, don't support RecomputeOptimizer.") + self._optimizer = optimizer + self._checkpoints = None + self._learning_rate = self._optimizer._learning_rate + self._learning_rate_map = self._optimizer._learning_rate_map + + def _set_checkpoints(self, checkpoints): + """ + Args: + checkpoints (list): List of Variable or string + """ + assert isinstance( + checkpoints, list + ), "_checkpoints should be a list of Variable or a list of String" + for ckpt in checkpoints: + assert ( + isinstance(ckpt, six.string_types) or isinstance(ckpt, Variable) + ), "_checkpoints should be a list of Variable or a list of String" + self._checkpoints = checkpoints + + def load(self, stat_dict): + """ + :api_attr: Static Graph + + load function is not supported by Recompute Optimizer for now. + :return: None + + Args: + stat_dict: the dict load by load_persistable method + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + import paddle.compat as cpt + + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = fluid.layers.fc(input=input_x, size=hid_dim) + prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + sum_cost = fluid.layers.reduce_mean(cost) + return sum_cost, fc_1, prediction + + input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') + input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + print("Finished FF") + + sgd = paddle.optimizer.Adam(lr=0.01) + sgd = paddle.optimizer.RecomputeOptimizer(sgd) + sgd._set_checkpoints([fc_1, pred]) + try: + stat_dict = {} + sgd.load(stat_dict) + except NotImplementedError as e: + print(cpt.get_exception_message(e)) + """ + raise NotImplementedError( + "load function is not supported by Recompute Optimizer for now") + + def apply_gradients(self, params_grads): + """ + call apply_gradients function of self._optimizer. + + Args: + params_grads (list): list of (param, grad) pair to do optimization. + + Returns: + list: A list of operators appended to the current program. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + import paddle + import paddle.fluid.framework as framework + + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = fluid.layers.fc(input=input_x, size=hid_dim) + prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + sum_cost = fluid.layers.reduce_mean(cost) + return sum_cost, fc_1, prediction + + + input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') + input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + print("Finished FF") + + sgd = paddle.optimizer.Adam(lr=0.01) + sgd = paddle.optimizer.RecomputeOptimizer(sgd) + sgd._set_checkpoints([fc_1, pred]) + params_grads = sgd.backward( + cost, + startup_program=None, + parameter_list=None, + no_grad_set=None) + + program = cost.block.program + with framework.program_guard(program, None): + optimize_ops = sgd.apply_gradients(params_grads) + + print("Finished apply gradients") + """ + + return self._optimizer.apply_gradients(params_grads=params_grads) + + def backward(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None): + """ + call append_backward with checkpoints. + + Args: + loss (Variable): loss variable to run optimizations. + startup_program (Program): startup_program for initializing parameters + in `parameter_list`. + parameter_list (list): list of Variables or Variable.names to update. + no_grad_set (set|None): set of Variables or Variables.names should be ignored. + callbacks (list|None): list of callables to run when appending backward + operator for one parameter. + checkpoints (list): list of Variables as checkpoints + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = fluid.layers.fc(input=input_x, size=hid_dim) + prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + sum_cost = fluid.layers.reduce_mean(cost) + return sum_cost, fc_1, prediction + + + input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') + input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + print("Finished FF") + + sgd = paddle.optimizer.Adam(lr=0.01) + sgd = paddle.optimizer.RecomputeOptimizer(sgd) + sgd._set_checkpoints([fc_1, pred]) + params_grads = sgd.backward( + cost, + startup_program=None, + parameter_list=None, + no_grad_set=None) + print("Finished backward") + """ + assert (self._checkpoints is not None + ), "You should call _set_checkpoints first" + + if framework.in_dygraph_mode(): + raise NotImplementedError( + "DyGraph current does not support recompute") + + self._dtype = loss.dtype + program = loss.block.program + with program_guard(program, startup_program): + checkpoint_vars = [] + for ckpt in self._checkpoints: + if isinstance(ckpt, Variable): + checkpoint_vars.append(ckpt) + else: + checkpoint_vars.append(loss.block.var(ckpt)) + + params_grads = append_backward( + loss, parameter_list, no_grad_set, checkpoints=checkpoint_vars) + # Note: since we can't use all_reduce_op now, + # dgc_op should be the last op of one grad. 
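+            # The hasattr check below keeps this wrapper usable with inner
+            # optimizers that do not implement DGC; only a DGC-style inner
+            # optimizer appends its gradient-compression ops here.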
+ if hasattr(self._optimizer, "_append_dgc_ops"): + self._optimizer._append_dgc_ops(params_grads) + return params_grads + + def apply_optimize(self, loss, startup_program, params_grads): + """ + call the apply_optimize function of self._optimizer + Args: + loss (Variable): loss variable to run optimizations. + startup_program (Program): startup_program for initializing parameters + in `parameter_list`. + params_grads (list): list of (param, grad) pair to do optimization. + Examples: + .. code-block:: python + import paddle.fluid as fluid + import paddle + + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = fluid.layers.fc(input=input_x, size=hid_dim) + prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + sum_cost = fluid.layers.reduce_mean(cost) + return sum_cost, fc_1, prediction + + input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') + input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + print("Finished FF") + + sgd = paddle.optimizer.Adam(lr=0.01) + sgd = paddle.optimizer.RecomputeOptimizer(sgd) + sgd._set_checkpoints([fc_1, pred]) + params_grads = sgd.backward( + cost, + startup_program=None, + parameter_list=None, + no_grad_set=None) + + optimize_ops = sgd.apply_optimize( + cost, startup_program=None, params_grads=params_grads) + + print("Finished apply_optimize") + """ + + return self._optimizer.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + + def minimize(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + assert isinstance(loss, Variable), "The loss should be an Variable." + assert (self._checkpoints is not None + ), "You should call _set_checkpoints first" + if framework.in_dygraph_mode(): + raise NotImplementedError( + "DyGraph current does not support recompute") + params_grads = self.backward( + loss, + startup_program=startup_program, + parameter_list=parameter_list, + no_grad_set=no_grad_set) + + optimize_ops = self.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + + return optimize_ops, params_grads + + +class LookaheadOptimizer(object): + """ + :api_attr: Static Graph + + This implements the Lookahead optimizer of the + paper : https://arxiv.org/abs/1907.08610. + + Lookahead keeps two sets of params: the fast_params and + the slow_params. inner_optimizer update fast_params every + training step. Lookahead updates the slow_params and fast_params + every k training steps as follows: + + .. math:: + + slow\_param_t &= slow\_param_{t-1} + \\alpha * (fast\_param_{t-1} - slow\_param_{t-1}) + + fast\_param_t &= slow\_param_t + + Args: + inner_optimizer (Optimizer): The optimizer that update fast params step by step. + alpha (float): The learning rate of Lookahead. + k (int): The slow params is updated every k steps. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + import numpy as np + + x = fluid.layers.data(name='x', shape=[2], dtype='float32') + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + y = fluid.layers.fc(input=[x], size=2, act="softmax") + loss = fluid.layers.cross_entropy(input=y, label=label) + loss = fluid.layers.mean(x=loss) + sgd = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.LookaheadOptimizer(sgd, + alpha=0.5, + k=5) + optimizer.minimize(loss) + main_program = fluid.default_main_program() + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + feeder = fluid.DataFeeder(feed_list=[x, label], place=place) + + step = 0 + while(step < 10): + step += 1 + exe.run(fluid.default_main_program(), + feed=feeder.feed(batch_data)) + + """ + + def __init__(self, inner_optimizer, alpha=0.5, k=5): + + if framework.in_dygraph_mode(): + raise Exception("In dygraph, don't support LookaheadOptimizer.") + assert (inner_optimizer is not None), "inner optimizer can not be None" + assert ( + 0.0 <= alpha <= 1.0 + ), "alpha should be larger or equal to 0.0, and less or equal than 1.0" + assert (isinstance(k, int) and k > 0), "k should be a positive integer" + + self.inner_optimizer = inner_optimizer + self.alpha = alpha + self.k = k + self.type = "lookahead" + + def minimize(self, loss, startup_program=None): + + # Apply inner optimizer to the main_program + mini_out = self.inner_optimizer.minimize( + loss, startup_program=startup_program) + + # Get startup_program and main_program + if startup_program is None: + startup_program = default_startup_program() + main_block = loss.block + + # add some vars to the main_program + params = [param.name for param in main_block.all_parameters()] + param_to_slow = {} + for param in params: + fast_var = main_block.var(param) + assert (fast_var is not None) + slow_var = main_block.create_var( + name=param + "@SLOW", + shape=fast_var.shape, + dtype=fast_var.dtype, + persistable=True) + param_to_slow[param] = slow_var + + # add some vars to the startup_program + startup_block = startup_program.global_block() + for param in params: + fast_var = startup_block.var(param) + assert (fast_var is not None) + slow_var = startup_block.create_var( + name=param + "@SLOW", + shape=fast_var.shape, + dtype=fast_var.dtype, + persistable=True) + + startup_block.append_op( + type="assign", + inputs={"X": fast_var}, + outputs={"Out": slow_var}) + + with framework.program_guard(main_block.program, startup_program): + # Add Var k to main prog and startup prog + k = layers.create_global_var( + name="lookahead_k", + shape=[1], + value=int(self.k), + dtype='int32', + persistable=True) + + # Add Var alpha to main prog and startup prog + alpha = layers.create_global_var( + name="lookahead_alpha", + shape=[1], + value=float(self.alpha), + dtype='float32', + persistable=True) + + # Add Var step + step = layers.create_global_var( + name="lookahead_step", + shape=[1], + value=int(0), + dtype='int32', + persistable=True) + layers.increment(x=step, value=1.0, in_place=True) + + # lookahead + zero_var = layers.fill_constant( + shape=[1], dtype='float32', value=0.0) + + one_var = layers.fill_constant( + shape=[1], dtype='float32', value=1.0) + + mod = layers.elementwise_mod(step, k) + with layers.control_flow.Switch() as switch: + with switch.case(mod == zero_var): + for param_name in params: + fast_var = main_block.var(param_name) + slow_var = param_to_slow[param_name] + tmp_var = 
layers.elementwise_add( + layers.elementwise_mul(fast_var, alpha), + layers.elementwise_mul( + slow_var, + layers.elementwise_sub(one_var, alpha))) + layers.assign(input=tmp_var, output=slow_var) + layers.assign(input=tmp_var, output=fast_var) + with switch.default(): + pass + return mini_out + + +class GradientMergeOptimizer(object): + """ + Gradient Merge, also called as Gradient Accumulation, + is a training strategy for larger batches. With this strategy, + the parameter will not be updated until specific steps. + + For each step, the forward network and the backward network + will run to calculate the gradient of the parameters. + + For every k step, the optimization network will run, + applying a specific optimization method (such as SGD, Adam) + to the parameters. + + Args: + inner_optimizer (Optimizer): The specific optimization (such as SGD, Adam) + which update the parameters + k_steps (int): the update period of the parameters + avg (bool): whether to average the gradients of each mini-batch, + the default value is `True` + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + import numpy as np + + def gen_data(batch_size): + return {"x": np.random.random(size=(batch_size, 32)).astype('float32'), + "y": np.random.random(size=(batch_size, 1)).astype('int64')} + + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = fluid.layers.fc(input=input_x, size=hid_dim) + prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + sum_cost = fluid.layers.reduce_mean(cost) + return sum_cost, fc_1, prediction + + input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') + input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + sgd = paddle.optimizer.Adam(lr=0.01) + sgd = paddle.optimizer.GradientMergeOptimizer(sgd, k_steps=4, avg=True) + sgd.minimize(cost) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + for i in range(10): + cost_val = exe.run(feed=gen_data(32), + program=fluid.default_main_program(), + fetch_list=[cost.name]) + print("step=%d, cost=%f" % (i, cost_val[0])) + """ + + def __init__(self, inner_optimizer, k_steps=1, avg=True): + if framework.in_dygraph_mode(): + raise Exception( + "In dygraph, we don't support GradientMergeOptimizer." + "You can do Gradient merge by yourself with k-times forward + backward, " + "and one-time optimizer.minimize()") + + assert (inner_optimizer is not None), "inner optimizer can not be None" + assert (isinstance(k_steps, int) and + k_steps > 0), "k_steps should be a positive integer" + + self.inner_optimizer = inner_optimizer + self.k_steps = k_steps + self.type = "gradient_merge" + self.avg = avg + + def _set_k_steps(self, k_steps): + self.k_steps = k_steps + + def _set_avg(self, avg): + self.avg = avg + + def minimize(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + + assert isinstance(loss, Variable), "The loss should be an Variable." 
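+        # A sketch of the update rule implemented below (assuming avg=True):
+        # for the first k_steps - 1 iterations only grad_merge += grad runs;
+        # on every k_steps-th iteration grad = (grad + grad_merge) / k_steps,
+        # the inner optimizer is applied, and grad_merge is reset to zero.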
+ assert ( + parameter_list is None + ), "The parameter_list should be None when using GradientMergeOptimizer" + assert ( + no_grad_set is None + ), "The no_grad_set should be None when using GradientMergeOptimizer" + + params_grads = self.inner_optimizer.backward( + loss, startup_program=startup_program) + + #TODO(mapingshuo) support sparse embedding + for k, v in params_grads: + assert ( + v.type != core.VarDesc.VarType.SELECTED_ROWS + ), "SELECTED_ROWS is not supported in GradientMergeOptimizer for now" + + param_to_grad = {k.name: v for (k, v) in params_grads} + + # Get startup_program and main_program + if startup_program is None: + startup_program = default_startup_program() + main_block = loss.block + + # add some vars to the main_program and startup_program + startup_block = startup_program.global_block() + param_names = param_to_grad.keys() + param_to_gradient_merge = {} + + for param_name in param_names: + param_var = main_block.var(param_name) + assert (param_var is not None) + gradient_merge_var = main_block.create_var( + name=param_name + "@GRAD@GradientMerge", + shape=param_var.shape, + dtype=param_var.dtype, + persistable=True) + param_to_gradient_merge[param_name] = gradient_merge_var + startup_gradient_merge_var = startup_block.create_var( + name=param_name + "@GRAD@GradientMerge", + shape=param_var.shape, + dtype=param_var.dtype, + persistable=True) + startup_block.append_op( + type="fill_constant", + outputs={"Out": startup_gradient_merge_var}, + attrs={ + "shape": param_var.shape, + "dtype": param_var.dtype, + "value": float(0), + }) + + with framework.program_guard(main_block.program, startup_program): + # Add Var k to main prog and startup prog + gradient_merge_k = layers.create_global_var( + name="gradient_merge_k", + shape=[1], + value=int(self.k_steps), + dtype='int32', + persistable=True) + + # Add Var step + gradient_merge_step = layers.create_global_var( + name="gradient_merge_step", + shape=[1], + value=int(0), + dtype='int32', + persistable=True) + layers.increment(x=gradient_merge_step, value=1.0, in_place=True) + + # gradient merge + zero_var = layers.fill_constant( + shape=[1], dtype='float32', value=0.0) + one_var = layers.fill_constant( + shape=[1], dtype='float32', value=1.0) + + mod = layers.elementwise_mod(gradient_merge_step, gradient_merge_k) + with layers.control_flow.Switch() as switch: + with switch.case(mod != zero_var): + # 1. update the gradient_merge_vars + # gradient_merge_vars += gradient_vars + cur_block = main_block.program.current_block() + for param_name in param_names: + grad = param_to_grad[param_name] + grad_merge = param_to_gradient_merge[param_name] + cur_block.append_op( + type="elementwise_add", + inputs={'X': grad, + 'Y': grad_merge}, + outputs={'Out': grad_merge}, + attrs={'axis': -1, + 'use_mkldnn': False}) + + with switch.default(): + # 1. 
update the graient_vars + # gradient_vars += gradient_merge_vars + cur_block_idx = main_block.program.current_block_idx + cur_block = main_block.program.current_block() + for param_name in param_names: + grad = param_to_grad[param_name] + grad_merge = param_to_gradient_merge[param_name] + if self.avg: + tmp_var = layers.elementwise_add(grad, grad_merge) + cur_block.append_op( + type='scale', + inputs={'X': tmp_var}, + outputs={'Out': grad}, + attrs={ + 'scale': 1.0 / self.k_steps, + 'bias': 0.0, + 'bias_after_scale': False + }) + else: + cur_block.append_op( + type="elementwise_add", + inputs={'X': grad, + 'Y': grad_merge}, + outputs={'Out': grad}, + attrs={'axis': -1, + 'use_mkldnn': False}) + + # 2. apply_optimize + target_grad_block = main_block.program._create_block( + parent_idx=cur_block.parent_idx) + target_grad_block._set_forward_block_idx(cur_block_idx) + main_block.program.current_block_idx = cur_block_idx + + optimize_ops = self.inner_optimizer.apply_optimize( + loss, + startup_program=startup_program, + params_grads=params_grads) + + # 3. clear gradient_merge_vars + for param_name in param_names: + grad_merge = param_to_gradient_merge[param_name] + layers.fill_constant( + shape=grad_merge.shape, + dtype=grad_merge.dtype, + value=0.0, + out=grad_merge) + return optimize_ops, params_grads From 73baac03b605d25d1be37ebbe9fde8193099fddf Mon Sep 17 00:00:00 2001 From: MRXLT Date: Tue, 18 Aug 2020 14:43:01 +0800 Subject: [PATCH 05/30] refine optimizer; notest --- .../fluid/tests/unittests/test_adam_op.py | 4 +- python/paddle/optimizer/__init__.py | 4 +- python/paddle/optimizer/optimizer.py | 4481 +---------------- 3 files changed, 99 insertions(+), 4390 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 3a35006e8cbc6..4c1ff217adcc6 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -444,7 +444,7 @@ def test_with_place(place, shape): test_with_place(place, shape) -class TestAdamOpBetasV2(unittest.TestCase): +class TestAdamOpV2(unittest.TestCase): def test_adam_op(self): exe = fluid.Executor(place, shape) train_prog = fluid.Program() @@ -461,7 +461,7 @@ def test_adam_op(self): shape=[1], value=0.95, dtype='float32', persistable=True) betas = [beta1, beta2] opt = paddle.optimizer.Adam( - lr=1e-5, betas=betas, weight_decay=0.01) + lr=1e-5, betas=betas, weight_decay=0.01, eps=1e-8) opt.minimize(loss) exe.run(startup) diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index 26b6c20af1176..15a3e8f7aa9bb 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -24,10 +24,12 @@ ] -from .optimizer import SGD, Momentum, Adagrad, Adam, Adamax, AdamW, Dpsgd, DecayedAdagrad, \ +from ..fluid.optimizer import SGD, Momentum, Adagrad, Dpsgd, DecayedAdagrad, \ Ftrl, SGDOptimizer, MomentumOptimizer, AdagradOptimizer, DpsgdOptimizer, \ DecayedAdagradOptimizer, RMSPropOptimizer, FtrlOptimizer, Adadelta, \ AdadeltaOptimizer, ModelAverage, LarsMomentum, \ LarsMomentumOptimizer, DGCMomentumOptimizer, LambOptimizer, \ ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, \ RecomputeOptimizer + +from .optimizer import Optimizer diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 0f7f9ccba38e0..975f9e1ad1bf6 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -37,19 +37,18 @@ from 
..fluid.dygraph.learning_rate_scheduler import LearningRateDecay, _LearningRateEpochDecay from paddle.fluid import core from paddle.fluid.layers import tensor -from paddle.fluid.regularizer import L2Decay -from functools import reduce +from ..fluid.functools import reduce from ..fluid.wrapped_decorator import signature_safe_contextmanager from .. import compat as cpt __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad', 'Ftrl', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', - 'AdamaxOptimizer', 'DpsgdOptimizer', 'DecayedAdagradOptimizer', - 'RMSPropOptimizer', 'FtrlOptimizer', 'Adadelta', 'AdadeltaOptimizer', - 'ModelAverage', 'LarsMomentum', 'LarsMomentumOptimizer', 'LambOptimizer', - 'ExponentialMovingAverage', 'PipelineOptimizer', 'LookaheadOptimizer', - 'RecomputeOptimizer' + 'AdamOptimizer', 'AdamaxOptimizer', 'DpsgdOptimizer', + 'DecayedAdagradOptimizer', 'RMSPropOptimizer', 'FtrlOptimizer', 'Adadelta', + 'AdadeltaOptimizer', 'ModelAverage', 'LarsMomentum', + 'LarsMomentumOptimizer', 'LambOptimizer', 'ExponentialMovingAverage', + 'PipelineOptimizer', 'LookaheadOptimizer', 'RecomputeOptimizer' ] @@ -64,12 +63,12 @@ class Optimizer(object): @imperative_base.no_grad() def __init__(self, learning_rate, - parameter_list=None, - regularization=None, + parameters=None, + weight_decay=None, grad_clip=None, name=None): self._parameter_list = list( - parameter_list) if parameter_list is not None else None + parameters) if parameters is not None else None self._name = name if framework.in_dygraph_mode(): if not isinstance(learning_rate, float) and \ @@ -79,21 +78,21 @@ def __init__(self, % type(learning_rate)) if self._parameter_list is None: raise AttributeError( - "parameter_list argument given to the Optimizer should not be None in dygraph mode." + "parameters argument given to the Optimizer should not be None in dygraph mode." ) - if regularization is not None: + if weight_decay is not None: for param in self._parameter_list: if param.regularizer is not None: logging.info( "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. " - "The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" - % regularization.__str__()) + "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" + % weight_decay.__str__()) break else: if not isinstance(learning_rate, float) and \ not isinstance(learning_rate, framework.Variable): raise TypeError( - "learning rate should be float or Variable, got %s here" % + "learning rate should be float or Tensor, got %s here" % type(learning_rate)) if grad_clip is not None: @@ -101,7 +100,11 @@ def __init__(self, raise TypeError( "'grad_clip' should be an instance of GradientClipBase's derived class" ) - self.regularization = regularization + if isinstance(weight_decay, float): + from ..fluid.regularizer import L2Decay + self.regularization = L2Decay(weight_decay) + else: + self.regularization = weight_decay self._grad_clip = grad_clip self._learning_rate = learning_rate # the learning rate type should be inferenced from loss @@ -130,18 +133,17 @@ def state_dict(self): Args: None Return: - state_dict(dict) : dict contains all the variable used by optimizer + state_dict(dict) : dict contains all the Tensor used by optimizer Examples: .. 
code-block:: python import paddle.fluid as fluid import paddle - with fluid.dygraph.guard(): emb = fluid.dygraph.Embedding([10, 10]) - adam = paddle.optimizer.Adam(0.001, params=emb.parameters()) + adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) state_dict = adam.state_dict() ''' @@ -165,12 +167,12 @@ def state_dict(self): return state_dict @framework.dygraph_only - def load_state_dict(self, state_dict): + def set_state_dict(self, state_dict): ''' Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed. Args: - state_dict(dict) : Dict contains all the Variable needed by optimizer + state_dict(dict) : Dict contains all the Tensor needed by optimizer Return: None @@ -183,14 +185,14 @@ def load_state_dict(self, state_dict): state_dict = emb.state_dict() fluid.save_dygraph(state_dict, "paddle_dy") - adam = paddle.optimizer.Adam(lr=fluid.layers.noam_decay( 100, 10000), - params=emb.parameters()) + adam = paddle.optimizer.Adam(learning_rate=fluid.layers.noam_decay( 100, 10000), + parameters=emb.parameters()) state_dict = adam.state_dict() fluid.save_dygraph(state_dict, "paddle_dy") para_state_dict, opti_state_dict = fluid.load_dygraph( "paddle_dy") - adam.load_state_dict(opti_state_dict) + adam.set_state_dict(opti_state_dict) ''' @@ -215,14 +217,14 @@ def load_state_dict(self, state_dict): self._learning_rate.step_num = global_step[0] else: raise RuntimeError( - "Type not supprt, value in state dict must be [VarBase, Variable, numpy], the type is ", + "Type not supprt, value in state dict must be [VarBase, Tensor, numpy], the type is ", type(global_step)) self._accumulators_holder = state_dict for k, v in self._accumulators.items(): for para_name, var_tmp in v.items(): assert var_tmp.name in state_dict, \ - "optimizer variable {} not found".format( var_tmp.name ) + "optimizer Tensor {} not found".format( var_tmp.name ) var = var_tmp.value() tensor = var.get_tensor() model_np = np.array(tensor) @@ -268,7 +270,7 @@ def _create_global_learning_rate(self): value=float(self._learning_rate), dtype='float32' if self._dtype is None else self._dtype, persistable=True) - # get learning rate Variable from LearningRateDecay + # get learning rate Tensor from LearningRateDecay elif isinstance(self._learning_rate, LearningRateDecay): self._learning_rate_map[framework.default_main_program( )] = self._learning_rate() @@ -284,8 +286,8 @@ def _create_global_learning_rate(self): else: if not isinstance(self._learning_rate, float): raise TypeError( - "learning rate variable is create outside optimizer," - "can not create new learning rate variable for new program" + "learning rate Tensor is create outside optimizer," + "can not create new learning rate Tensor for new program" ) # create learning rate in the current main program @@ -306,7 +308,7 @@ def set_lr(self, value): this API cannot be invoked, because it will lead to conflict. 
Args: - value (float|Variable): the value of learning rate + value (float|Tensor): the value of learning rate Returns: None @@ -316,11 +318,10 @@ def set_lr(self, value): import paddle.fluid as fluid import paddle - with fluid.dygraph.guard(): linear = fluid.dygraph.nn.Linear(10, 10) - adam = paddle.optimizer.Adam(0.1, params=linear.parameters()) + adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) # set learning rate manually by python float value lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] @@ -336,7 +337,7 @@ def set_lr(self, value): # current lr is 0.6 - # set learning rate manually by framework Variable + # set learning rate manually by framework Tensor lr_var = fluid.layers.create_global_var( shape=[1], value=0.7, dtype='float32') adam.set_lr(lr_var) @@ -350,7 +351,7 @@ def set_lr(self, value): """ if not isinstance(value, (framework.Variable, float)): raise TypeError( - "The type of 'value' in optimizer.set_lr must be (float, Variable), but received %s." + "The type of 'value' in optimizer.set_lr must be (float, Tensor), but received %s." % (type(value))) if isinstance(self._learning_rate, LearningRateDecay): raise RuntimeError( @@ -390,13 +391,12 @@ def current_step_lr(self): .. code-block:: python import paddle.fluid as fluid - import paddle import numpy as np - + import paddle # example1: LearningRateDecay is not used, return value is all the same with fluid.dygraph.guard(): emb = fluid.dygraph.Embedding([10, 10]) - adam = paddle.optimizer.Adam(0.001, params = emb.parameters()) + adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters()) lr = adam.current_step_lr() print(lr) # 0.001 @@ -411,7 +411,7 @@ def current_step_lr(self): bd = [2, 4, 6, 8] value = [0.2, 0.4, 0.6, 0.8, 1.0] adam = paddle.optimizer.Adam(fluid.dygraph.PiecewiseDecay(bd, value, 0), - params=linear.parameters()) + parameters=linear.parameters()) # first step: learning rate is 0.2 np.allclose(adam.current_step_lr(), 0.2, rtol=1e-06, atol=0.0) # True @@ -588,7 +588,7 @@ def _create_optimization_pass(self, parameters_and_grads): """Add optimization operators to update gradients to variables. Args: - parameters_and_grads(list(tuple(Variable, Variable))): + parameters_and_grads(list(tuple(Tensor, Tensor))): a list of (variable, gradient) pair to update. Returns: @@ -699,7 +699,7 @@ def _append_dgc_ops(self, param_and_grad): def backward(self, loss, startup_program=None, - parameter_list=None, + parameters=None, no_grad_set=None, callbacks=None): """ @@ -707,14 +707,14 @@ def backward(self, the current program. Args: - loss (Variable): ``loss`` variable to run optimizations. + loss (Tensor): ``loss`` variable to run optimizations. startup_program (Program, optional): :ref:`api_fluid_Program` for - initializing parameters in ``parameter_list``. The default value + initializing parameters in ``parameters``. The default value is None, at this time :ref:`api_fluid_default_startup_program` will be used. - parameter_list (Iterable, optional): Iterable of ``Variable`` or ``Variable.name`` to update + parameters (Iterable, optional): Iterable of ``Tensor`` or ``Tensor.name`` to update to minimize ``loss``. The default value is None, at this time all parameters will be updated. - no_grad_set (set, optional): Set of ``Variable`` or ``Variable.name`` that don't need + no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need to be updated. The default value is None. callbacks (list, optional): list of callable objects to run when appending backward operator for one parameter. 
The default value is None. @@ -752,7 +752,7 @@ def backward(self, "The loss.shape should be (1L,), but the current loss.shape is {}. " \ "Maybe that you should call fluid.layers.mean to process the current loss.".format( loss.shape) - parameter_list = parameter_list if parameter_list \ + parameter_list = parameters if parameters \ else self._parameter_list with program_guard(program, startup_program): params_grads = append_backward(loss, parameter_list, @@ -776,10 +776,9 @@ def apply_gradients(self, params_grads): Examples: .. code-block:: python - import paddle.fluid as fluid import paddle loss = network() - optimizer = paddle.optimizer.SGD(learning_rate=0.1) + optimizer = paddle.optimizer.SGD(learning_rate=0.1) params_grads = optimizer.backward(loss) # you may append operations for params_grads here # ... @@ -792,6 +791,7 @@ def apply_gradients(self, params_grads): if self._grad_clip is not None: params_grads = self._grad_clip(params_grads) else: + params_grads = append_gradient_clip_ops(params_grads) # Add regularization if any @@ -806,9 +806,9 @@ def apply_optimize(self, loss, startup_program, params_grads): Second part of `minimize`, appending optimization operators for given `params_grads` pairs. Args: - loss (Variable): loss variable to run optimizations. + loss (Tensor): loss variable to run optimizations. startup_program (Program): startup_program for initializing parameters - in `parameter_list`. + in `parameters`. params_grads (list): list of (param, grad) pair to do optimization. Returns: list: A list of operators appended to the current program. @@ -849,16 +849,15 @@ def clear_gradients(self): .. code-block:: python import paddle.fluid as fluid - import paddle import numpy as np - + import paddle with fluid.dygraph.guard(): value = np.arange(26).reshape(2, 13).astype("float32") a = fluid.dygraph.to_variable(value) linear = fluid.Linear(13, 5, dtype="float32") # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(lr = 0.01, - params = linear.parameters()) + adam = paddle.optimizer.Adam(learning_rate = 0.01, + parameters = linear.parameters()) out = linear(a) out.backward() adam.minimize(out) @@ -873,20 +872,20 @@ def clear_gradients(self): def minimize(self, loss, startup_program=None, - parameter_list=None, + parameters=None, no_grad_set=None): """ - Add operations to minimize ``loss`` by updating ``parameter_list``. + Add operations to minimize ``loss`` by updating ``parameters``. Args: - loss (Variable): A ``Variable`` containing the value to minimize. + loss (Tensor): A ``Tensor`` containing the value to minimize. startup_program (Program, optional): :ref:`api_fluid_Program` for - initializing parameters in ``parameter_list``. The default value + initializing parameters in ``parameters``. The default value is None, at this time :ref:`api_fluid_default_startup_program` will be used. - parameter_list (Iterable, optional): Iterable of ``Variable`` or ``Variable.name`` to update + parameters (Iterable, optional): Iterable of ``Tensor`` or ``Tensor.name`` to update to minimize ``loss``. The default value is None, at this time all parameters will be updated. - no_grad_set (set, optional): Set of ``Variable`` or ``Variable.name`` that don't need + no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need to be updated. The default value is None. Returns: @@ -900,14 +899,14 @@ def minimize(self, Examples: Please refer to the example of current Optimizer. """ - assert isinstance(loss, Variable), "The loss should be an Variable." 
+ assert isinstance(loss, Variable), "The loss should be an Tensor." - parameter_list = parameter_list if parameter_list \ + parameter_list = parameters if parameters \ else self._parameter_list params_grads = self.backward( loss, startup_program=startup_program, - parameter_list=parameter_list, + parameters=parameter_list, no_grad_set=no_grad_set) optimize_ops = self.apply_optimize( @@ -915,4334 +914,42 @@ def minimize(self, return optimize_ops, params_grads + @framework.dygraph_only + def step(self): + """ + Execute the optimizer once. + + Returns: + None -class SGDOptimizer(Optimizer): - """ - Optimizer of the stochastic gradient descent algorithm. - - .. math:: - - param\_out = param - learning\_rate * grad - - Parameters: - learning_rate (float|Variable): The learning rate used to update parameters. \ - Can be a float value or a Variable with one float value as data element. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): This parameter is used by developers to print debugging information. \ - For details, please refer to :ref:`api_guide_Name`. Default is None. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - - """ - - def __init__(self, - learning_rate, - parameter_list=None, - regularization=None, - grad_clip=None, - name=None): - assert learning_rate is not None - super(SGDOptimizer, self).__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name) - self.type = "sgd" - - @no_grad() - def _append_optimize_op(self, block, param_and_grad): - lr = self._create_param_lr(param_and_grad) - if framework.in_dygraph_mode(): - core.ops.sgd(param_and_grad[0], lr, param_and_grad[1], - param_and_grad[0]) - return None - - assert isinstance(block, framework.Block) - # create the optimize op - sgd_op = block.append_op( - type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "LearningRate": lr - }, - outputs={"ParamOut": param_and_grad[0]}, - stop_gradient=True) - - return sgd_op - - -class MomentumOptimizer(Optimizer): - """ - - Simple Momentum optimizer with velocity state - - This optimizer has a flag for Nestrov Momentum. - - The update equations are as follows: - - .. math:: - - & velocity = mu * velocity + gradient - - & if (use\_nesterov): - - &\quad param = param - (gradient + mu * velocity) * learning\_rate - - & else: - - &\quad param = param - learning\_rate * velocity - - Parameters: - learning_rate (float|Variable): The learning rate used to update parameters. \ - Can be a float value or a Variable with one float value as data element. - momentum (float): Momentum factor - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - use_nesterov (bool, optional): Enables Nesterov momentum, default is false. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). 
Default None, meaning there is no gradient clipping. - name (str, optional): This parameter is used by developers to print debugging information. \ - For details, please refer to :ref:`api_guide_Name`. Default is None. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - moment_optimizer = paddle.optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9) - moment_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - - """ - _velocity_acc_str = "velocity" - - def __init__(self, - learning_rate, - momentum, - parameter_list=None, - use_nesterov=False, - regularization=None, - grad_clip=None, - name=None): - assert learning_rate is not None - assert momentum is not None - super(MomentumOptimizer, self).__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name) - self.type = "momentum" - self._momentum = momentum - self._use_nesterov = bool(use_nesterov) - - def _create_accumulators(self, block, parameters): - assert isinstance(block, framework.Block) - - for p in parameters: - self._add_accumulator(self._velocity_acc_str, p) - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - - velocity_acc = self._get_accumulator(self._velocity_acc_str, - param_and_grad[0]) - lr = self._create_param_lr(param_and_grad) - - if framework.in_dygraph_mode(): - _, _ = core.ops.momentum(param_and_grad[0], param_and_grad[1], - velocity_acc, lr, param_and_grad[0], - velocity_acc, 'mu', self._momentum, - 'use_nesterov', self._use_nesterov) - return None - - attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov} - inputs = { - "Param": [param_and_grad[0]], - "Grad": [param_and_grad[1]], - "Velocity": [velocity_acc], - "LearningRate": [lr] - } - - outputs = { - "ParamOut": [param_and_grad[0]], - "VelocityOut": [velocity_acc] - } - # create the momentum optimize op - momentum_op = block.append_op( - type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True) - - return momentum_op - - -class DGCMomentumOptimizer(Optimizer): - """ - :api_attr: Static Graph - - DGC (Deep Gradient Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887 - - DGC reduces the communication bandwidth by sending only the important gradients (sparse update):\ - only gradients larger than a threshold are transmitted. - - To avoid losing information, DGC accumulates the rest of the gradients locally. - - Eventually, these gradients become large enough to be transmitted. - - Thus, DGC sends the large gradients immediately but eventually sends all of the gradients over time. 
- - To ensure no loss of accuracy, DGC employs momentum correction and local gradient clipping on top of the gradient sparsification to maintain model performance. - - DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication. - - This optimizer will do two things: - - 1. Compress the gradient by get TopK import value from tensor \ - and use it for allreduce to reduce network bandwidth. - - 2. Call momentum to optimize the cost. - - Args: - learning_rate (float|Variable): The learning rate used to update parameters. \ - It can be a float value or a Variable with one float value as a data element. - momentum (float): Momentum factor. - rampup_begin_step (int): The beginning step from which gradient compression is implemented. - rampup_step (int): Time steps used in sparsity warm-up periods. Default is 1. - For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100, \ - it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. \ - And when reach sparsity array ends, it will use 0.999 then and after. - sparsity (list[float]): Get top important element from gradient tensor, the ratio is (1 - current sparsity). \ - Default is [0.999]. For example, if the sparsity is [0.99, 0.999], \ - the top [1%, 0.1%] important element will be transmitted. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - use_nesterov (bool): Enables Nesterov momentum. True means use Nesterov. Default is False. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipByNorm, optional): Gradient cliping strategy. ``DGCMomentumOptimizer`` only support - :ref:`api_fluid_clip_GradientClipByNorm` , and if not, it will raise TypeError. Default None, - meaning there is no gradient clipping. - name (str, optional): This parameter is used by developers to print debugging information. \ - For details, please refer to :ref:`api_guide_Name`. Default is None. - - Examples: - .. code-block:: python - - import paddle - optimizer = paddle.optimizer.DGCMomentumOptimizer( - learning_rate=0.0001, - momentum=0.9, - rampup_step=1000, - rampup_begin_step=1252, - sparsity=[0.999, 0.999]) - - """ - _u_velocity_acc_str = "_dgc_u_" - _v_velocity_acc_str = "_dgc_v_" - - def __init__(self, - learning_rate, - momentum, - rampup_begin_step, - rampup_step=1, - sparsity=[0.999], - parameter_list=None, - use_nesterov=False, - num_trainers=None, - regularization=None, - grad_clip=None, - name=None): - if framework.in_dygraph_mode(): - raise Exception("In dygraph, don't support DGCMomentumOptimizer.") - - assert core.is_compiled_with_cuda(), \ - "Paddle is not compiled with CUDA. DGC is only support GPU for now." 
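        # Illustrative sketch of the sparsity warm-up schedule described in the
        # docstring (assumed behaviour, not the exact kernel logic):
        #   period = rampup_step // len(sparsity)
        #   idx    = min((step - rampup_begin_step) // max(period, 1),
        #                len(sparsity) - 1)
        #   current_sparsity = sparsity[idx]
        # e.g. sparsity=[0.75, 0.9375, 0.984375, 0.996, 0.999] with
        # rampup_step=100 uses 0.75 for roughly the first 20 steps, then 0.9375,
        # and stays at 0.999 once the list is exhausted.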
- - assert learning_rate is not None - assert momentum is not None - super(DGCMomentumOptimizer, self).__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name) - self.type = "dgc_momentum" - self._momentum = momentum - self._use_nesterov = bool(use_nesterov) - - assert rampup_begin_step >= 0, "rampup_begin_step must >= 0" - self._rampup_begin_step = rampup_begin_step - self._rampup_step = rampup_step - self._sparsity = sparsity - - self._rampup_begin_step_var = None - self._global_step_var = None - - self._dgc_clip_norm = None - if grad_clip is not None: - if not isinstance(grad_clip, GradientClipByNorm): - raise TypeError( - "The type of grad_clip should be 'GradientClipByNorm', because DGCMomentumOptimizer only support GradientClipByNorm" - ) - assert isinstance( - num_trainers, int - ), "The type of num_trainers should be 'int', but received %s" % type( - value) - assert num_trainers > 0, "The value of num_trainers should be greater than 0!" - - self._num_trainers = num_trainers - self._dgc_clip_norm = grad_clip.clip_norm * (num_trainers**-0.5) - - self.regular_type, self.regular_coeff = self._get_regularization_param( - self.regularization) - - def _get_regularization_param(self, regularization): - regular_type = 0 - regular_coeff = 0.0 - - if regularization is not None: - regular_coeff = regularization._regularization_coeff - from .regularizer import L1Decay, L2Decay - if isinstance(regularization, L1Decay): - regular_type = 1 - elif isinstance(regularization, L2Decay): - regular_type = 2 - else: - assert False, 'regularization must be None|L1Decay|L2Deacy' - return regular_type, regular_coeff - - def _is_use_dgc(self, param_var, grad_var): - var_numel = abs(reduce(lambda x, y: x * y, param_var.shape)) - if var_numel < 16384 or \ - param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \ - grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \ - param_var.dtype != core.VarDesc.VarType.FP32 : - return False - return True - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - velocity_acc = self._get_accumulator(self._u_velocity_acc_str, - param_and_grad[0]) - assert velocity_acc is not None - - inputs = { - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Velocity": velocity_acc, - "LearningRate": self._create_param_lr(param_and_grad), - } - outputs = { - "ParamOut": param_and_grad[0], - "VelocityOut": velocity_acc, - } - attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov} - - if not self._is_use_dgc(param_and_grad[0], param_and_grad[1]): - type = "momentum" - else: - type = "dgc_momentum" - inputs.update({ - "current_step": self._global_step_var, - "nranks": self._nranks_var - }) - outputs.update({'Grad_out': param_and_grad[1]}) - attrs.update({"rampup_begin_step": float(self._rampup_begin_step)}) - - # create the dgc momentum optimize op - dgc_momentum_op = block.append_op( - type=type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True) - return dgc_momentum_op - - def _add_auto_increment_var(self, counter_name, begin, step=1): - helper = LayerHelper('global_step_counter') - counter, is_new_var = helper.create_or_get_global_variable( - name=counter_name, dtype='float32', shape=[1], persistable=True) - if is_new_var: - helper.set_variable_initializer( - counter, - initializer=Constant( - value=float(begin - 1), force_cpu=True)) - helper.main_program.global_block()._prepend_op( - type='increment', - 
inputs={'X': [counter]}, - outputs={'Out': [counter]}, - attrs={'step': float(step)}, - stop_gradient=True) - counter.stop_gradient = True - - return counter - - def _add_nranks_var(self, name, value=-1): - helper = LayerHelper('global_step_counter') - counter, is_new_var = helper.create_or_get_global_variable( - name=name, dtype='float32', shape=[1], persistable=True) - if is_new_var: - helper.set_variable_initializer( - counter, - initializer=Constant( - value=float(value), force_cpu=True)) - counter.stop_gradient = True - - return counter - - def _append_dgc_ops(self, param_and_grads): - main_program = default_main_program() - main_program._enable_dgc = True - - # step counter - self._global_step_var = self._add_auto_increment_var( - counter_name=core.dgc.kDGCCounterName(), begin=0) - - self._nranks_var = self._add_nranks_var( - name=core.dgc.kDGCNRanksName(), value=-1) - - # rampup begin step var for all_reduce_op_handle - self._rampup_begin_step_var = tensor.create_global_var( - shape=[1], - dtype=core.VarDesc.VarType.FP32, - persistable=True, - name=core.dgc.kDGCRampUpBeginStepName(), - value=self._rampup_begin_step * 1.0, - force_cpu=True) - - self.helper = LayerHelper(self.__class__.__name__) + Examples: + .. code-block:: python - for param_var, grad_var in param_and_grads: - # reuse velocity in dgc_op and dgc_momentum_op - u_var = self._add_accumulator(self._u_velocity_acc_str, param_var) + import paddle + import paddle.fluid as fluid + import numpy as np - if not self._is_use_dgc(param_var, grad_var): + with fluie.dygraph.guard(): + value = np.arange(26).reshape(2, 13).astype("float32") + a = fluid.dygraph.to_variable(value) + linear = fluid.Linear(13, 5, dtype="float32") + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.Adam(learning_rate = 0.01, + parameters = linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + """ + parameter_list = self._parameter_list + self._dtype = None + params_grads = [] + for param in self._parameter_list: + if not param.trainable: continue - - v_var = self._add_accumulator(self._v_velocity_acc_str, param_var) - - k_var = tensor.create_global_var( - shape=[1], - dtype=param_var.dtype, - persistable=True, - name=param_var.name + core.dgc.kDGCKName(), - value=0.0, - force_cpu=True) - - encoded_var = tensor.create_global_var( - shape=[1], - dtype=param_var.dtype, - persistable=True, - name=param_var.name + core.dgc.kDGCEncodedName(), - value=0.0, - force_cpu=False) - - gather_var = tensor.create_global_var( - shape=[1], - dtype=param_var.dtype, - persistable=True, - name=param_var.name + core.dgc.kDGCGatherName(), - value=0.0, - force_cpu=False) - - # del back oprolevarname - op_maker = core.op_proto_and_checker_maker - backward = core.op_proto_and_checker_maker.OpRole.Backward - for op in main_program.global_block().ops: - if not self._is_the_backward_op(op): - continue - - var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()] - if param_var.name not in var_attr: - continue - - var_attr.remove(param_var.name) - var_attr.remove(grad_var.name) - if len(var_attr) > 1: - op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr) - else: - op._remove_attr(op_maker.kOpRoleVarAttrName()) - - clip_var = grad_var - if self._dgc_clip_norm is not None: - clip_var = self._append_clip_norm(grad_var, self._dgc_clip_norm) - self._dgc_op(param_var, clip_var, grad_var, u_var, v_var, k_var, - encoded_var, gather_var) - - def _is_the_backward_op(self, op): - op_maker = core.op_proto_and_checker_maker - 
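        # An op counts as a backward op when it both carries the op-role-var
        # attribute and has its op-role attribute set to OpRole.Backward.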
backward = core.op_proto_and_checker_maker.OpRole.Backward - if op_maker.kOpRoleVarAttrName() in op.attr_names and \ - int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(backward): - return True - return False - - def _clip_by_norm(self, x, max_norm, name=None): - args = {'x': x, 'max_norm': max_norm, 'name': name} - - helper = LayerHelper("dgc_clip_by_norm_op", **args) - - if name is None: - name = unique_name.generate_with_ignorable_key(".".join( - [helper.name, 'tmp'])) - - out = helper.create_variable( - type=x.type, name=name, dtype=x.dtype, persistable=False) - - helper.append_op( - type="dgc_clip_by_norm", - inputs={"X": x, - "current_step": self._global_step_var}, - attrs={ - "max_norm": max_norm, - "rampup_begin_step": float(self._rampup_begin_step) - }, - outputs={"Out": out}) - return out - - def _append_clip_norm(self, grad_var, clip_norm): - with grad_var.block.program._backward_role_guard(): - return self._clip_by_norm( - x=grad_var, max_norm=clip_norm, name=grad_var.name) - - def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var, - encoded_var, gather_var): - block = framework.default_main_program().global_block() - op_maker = core.op_proto_and_checker_maker - - regular_type = self.regular_type - regular_coeff = self.regular_coeff - # The regularizer of the Parameters have higher priority - if param_var.regularizer is not None: - regular_type, regular_coeff = self._get_regularization_param( - param_var.regularizer) - - dgc_op = block.append_op( - type="dgc", - inputs={ - "U": u_var, - "V": v_var, - "Grad": clip_var, - "Param": param_var, - "current_step": self._global_step_var, - "nranks": self._nranks_var, - }, - outputs={ - "U_out": u_var, - "V_out": v_var, - "EncodeGrad": encoded_var, - "k": k_var, - "Grad_out": grad_var, - "GatherBuff": gather_var, - }, - attrs={ - "m": self._momentum, - "sparsity": self._sparsity, - "use_nesterov": self._use_nesterov, - "rampup_begin_step": float(self._rampup_begin_step), - "rampup_step": float(self._rampup_step), - "regular_coeff": float(regular_coeff), - "regular_type": int(regular_type), - }, - stop_gradient=True) - - backward = op_maker.OpRole.Backward - dgc_op._set_attr(op_maker.kOpRoleAttrName(), backward) - dgc_op._set_attr(op_maker.kOpRoleVarAttrName(), - [param_var.name, grad_var.name]) - - @imperative_base.no_grad() - def apply_gradients(self, params_grads): - params_grads = sorted(params_grads, key=lambda x: x[0].name) - params_grads, table_param_and_grad, table_optimize_op = \ - self._process_distribute_lookuptable(params_grads) - - not_dgc_params_grads = [] - dgc_params_grads = [] - # DGC clip and regularization in optimizer.backward - for param, grad in params_grads: - if not self._is_use_dgc(param, grad): - not_dgc_params_grads.append((param, grad)) - else: - dgc_params_grads.append((param, grad)) - - # 'optimizer(grad_clip)' or 'set_gradient_clip' - if self._grad_clip is not None: - not_dgc_params_grads = self._grad_clip(not_dgc_params_grads) - else: - not_dgc_params_grads = append_gradient_clip_ops( - not_dgc_params_grads) - - not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads, - self.regularization) - - params_grads = not_dgc_params_grads + dgc_params_grads - params_grads = sorted(params_grads, key=lambda x: x[0].name) - - optimize_ops = self._create_optimization_pass(params_grads) - if table_optimize_op is not None: - optimize_ops.append(table_optimize_op) - params_grads.append(table_param_and_grad) - - return optimize_ops - - -class LarsMomentumOptimizer(Optimizer): - """ - 
Momentum optimizer with LARS support - - The update equations are as follows: - - .. math:: - - & local\_learning\_rate = learning\_rate * lars\_coeff * \\ - \\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||} - - & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param) - - & param = param - velocity - - Parameters: - learning_rate (float|Variable): The learning rate used to update parameters. \ - Can be a float value or a Variable with one float value as data element. \ - momentum (float): momentum factor - lars_coeff (float): Defines how much we trust the layer to change its weights. - lars_weight_decay (float): Weight decay coefficient for decaying using LARS. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): This parameter is used by developers to print debugging information. \ - For details, please refer to :ref:`api_guide_Name`. Default is None. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy as np - import paddle - - np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - inp = fluid.layers.data( - name="inp", shape=[2, 2], append_batch_size=False) - out = fluid.layers.fc(inp, size=3) - out = fluid.layers.reduce_sum(out) - optimizer = paddle.optimizer.LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9) - optimizer.minimize(out) - - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - exe.run( - feed={"inp": np_inp}, - fetch_list=[out.name]) - """ - _velocity_acc_str = "velocity" - - def __init__(self, - learning_rate, - momentum, - lars_coeff=0.001, - lars_weight_decay=0.0005, - parameter_list=None, - regularization=None, - grad_clip=None, - name=None): - assert learning_rate is not None - assert momentum is not None - super(LarsMomentumOptimizer, self).__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name) - self.type = "lars_momentum" - self._momentum = momentum - self._lars_coeff = float(lars_coeff) - self._lars_weight_decay = float(lars_weight_decay) - - def _create_accumulators(self, block, parameters): - assert isinstance(block, framework.Block) - - for p in parameters: - self._add_accumulator(self._velocity_acc_str, p) - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - - velocity_acc = self._get_accumulator(self._velocity_acc_str, - param_and_grad[0]) - # create the momentum optimize op - momentum_op = block.append_op( - type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Velocity": velocity_acc, - "LearningRate": self._create_param_lr(param_and_grad) - }, - outputs={ - "ParamOut": param_and_grad[0], - "VelocityOut": velocity_acc - }, - attrs={ - "mu": self._momentum, - "lars_coeff": self._lars_coeff, - "lars_weight_decay": self._lars_weight_decay - }, - stop_gradient=True) - - return momentum_op - - -class AdagradOptimizer(Optimizer): - """ - The Adaptive Gradient optimizer (Adagrad for short) can adaptively assign - different learning rates to individual parameters. - - The parameter ``param_out`` update rule with gradient ``grad``: - - .. math:: - - moment\_out &= moment + grad * grad - - param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} - - Related paper: `Adaptive Subgradient Methods for Online Learning and - Stochastic Optimization `_. - - The original paper does not have the ``epsilon`` attribute. It is added here - in our implementation as also proposed `Per-parameter adaptive learning rate - methods `_ - for numerical stability to avoid the division by zero error. - - Args: - learning_rate (float|Variable): The learning rate used to update ``Parameter``. - It can be a float value or a ``Variable`` with a float type. - epsilon (float, optional): A small float value for numerical stability. - The default value is 1e-06. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . 
If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. - The default value is None. - initial_accumulator_value (float, optional): Initial value for moment accumulator. - The default value is 0.0. - - Examples: - .. code-block:: python - - import numpy as np - import paddle - import paddle.fluid as fluid - - np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - inp = fluid.data(name="inp", shape=[2, 2]) - out = fluid.layers.fc(inp, size=3) - out = fluid.layers.reduce_sum(out) - optimizer = paddle.optimizer.AdagradOptimizer(learning_rate=0.2) - optimizer.minimize(out) - - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - exe.run( - feed={"inp": np_inp}, - fetch_list=[out.name]) - """ - _moment_acc_str = "moment" - - def __init__(self, - learning_rate, - epsilon=1.0e-6, - parameter_list=None, - regularization=None, - grad_clip=None, - name=None, - initial_accumulator_value=0.0): - assert learning_rate is not None - assert epsilon is not None - super(AdagradOptimizer, self).__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name) - self.type = "adagrad" - self._epsilon = epsilon - self.initial_accumulator_value = initial_accumulator_value - - def _create_accumulators(self, block, parameters): - assert isinstance(block, framework.Block) - - for p in parameters: - self._add_accumulator( - self._moment_acc_str, - p, - fill_value=self.initial_accumulator_value) - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - - moment_acc = self._get_accumulator(self._moment_acc_str, - param_and_grad[0]) - # Create the adagrad optimizer op - adagrad_op = block.append_op( - type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Moment": moment_acc, - "LearningRate": self._create_param_lr(param_and_grad) - }, - outputs={"ParamOut": param_and_grad[0], - "MomentOut": moment_acc}, - attrs={"epsilon": self._epsilon}, - stop_gradient=True) - - return adagrad_op - - -class Adam(Optimizer): - """ - The Adam optimizer uses an optimization described at the end - of section 2 of `Adam paper `_ , - it can dynamically adjusts the learning rate of each parameter using - the 1st moment estimates and the 2nd moment estimates of the gradient. - - The parameter ``param_out`` update rule with gradient ``grad``: - - .. 
math:: - - t & = t + 1 - - moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad - - moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad - - learning\_rate & = learning\_rate * \\ - \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t} - - param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} - - Related paper: `Adam: A Method for Stochastic Optimization `_ - - Args: - lr (float|Tensor, optional): The learning rate used to update ``Parameter``. - It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. - betas (list[float]|list[Tensor], optional): The list of exponential decay rate for moment estimates. - The elements of list should be float numbers or Tensor with shape [1] and data type as float32. - The default value is [0.9, 0.999]. - eps (float, optional): A small float value for numerical stability. - The default value is 1e-08. - params (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - weight_decay (float, optional): Weight decay of L2 regularization. The default value is 0. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. - The default value is None. - lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators. - The accumulators are updated at every step. Every element of the two moving-average - is updated in both dense mode and sparse mode. If the size of parameter is very large, - then the update may be very slow. The lazy mode only update the element that has - gradient in current mini-batch, so it will be much more faster. But this mode has - different semantics with the original Adam algorithm and may lead to different result. - The default value is False. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.data(name='x', shape=[None, 13], dtype='float32') - y = fluid.data(name='y', shape=[None, 1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - adam_optimizer = paddle.optimizer.Adam(0.01) - adam_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - - .. 
code-block:: python - - # Adam with betas as list[Tensor] - import paddle - import paddle.fluid as fluid - import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.data(name='x', shape=[None, 13], dtype='float32') - y = fluid.data(name='y', shape=[None, 1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - # define beta decay Tensor - def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate): - global_step = lr_scheduler._decay_step_counter() - - beta1 = fluid.layers.create_global_var( - shape=[1], - value=float(beta1_init), - dtype='float32', - # set persistable for save checkpoints and resume - persistable=True, - name="beta1") - beta2 = fluid.layers.create_global_var( - shape=[1], - value=float(beta2_init), - dtype='float32', - # set persistable for save checkpoints and resume - persistable=True, - name="beta2") - - div_res = global_step / decay_steps - decayed_beta1 = beta1_init * (decay_rate**div_res) - decayed_beta2 = beta2_init * (decay_rate**div_res) - fluid.layers.assign(decayed_beta1, beta1) - fluid.layers.assign(decayed_beta2, beta2) - - return [beta1, beta2] - - betas = get_decayed_betas(0.9, 0.99, 1e5, 0.9) - adam_optimizer = paddle.optimizer.Adam( - lr=0.01, - betas=betas) - adam_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - """ - _moment1_acc_str = "moment1" - _moment2_acc_str = "moment2" - _beta1_pow_acc_str = "beta1_pow_acc" - _beta2_pow_acc_str = "beta2_pow_acc" - - def __init__(self, - lr=0.001, - betas=[0.9, 0.999], - eps=1e-8, - params=None, - weight_decay=0.0, - grad_clip=None, - name=None, - lazy_mode=False): - assert learning_rate is not None - assert betas is not None - assert epsilon is not None - regularization = L2Decay(regularization_coeff=weight_decay) - super(Adam, self).__init__( - learning_rate=lr, - parameter_list=params, - regularization=regularization, - grad_clip=grad_clip, - name=name) - self.type = "adam" - self._beta1 = betas[0] - self._beta2 = betas[1] - self._epsilon = epsilon - self._lazy_mode = lazy_mode - - def _create_accumulators(self, block, parameters): - assert isinstance(block, framework.Block) - - # Create accumulator tensors for first and second moments - for p in parameters: - self._add_accumulator(self._moment1_acc_str, p) - self._add_accumulator(self._moment2_acc_str, p) - self._add_accumulator( - name=self._beta1_pow_acc_str, - param=p, - fill_value=0.9 if isinstance(self._beta1, Variable) \ - else self._beta1, - shape=[1], - type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') - self._add_accumulator( - name=self._beta2_pow_acc_str, - param=p, - fill_value=0.999 if isinstance(self._beta2, Variable) \ - else self._beta2, - shape=[1], - type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - - moment1 = self._get_accumulator(self._moment1_acc_str, - param_and_grad[0]) - moment2 = self._get_accumulator(self._moment2_acc_str, - param_and_grad[0]) - beta1_pow_acc = 
self._get_accumulator(self._beta1_pow_acc_str, - param_and_grad[0]) - beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, - param_and_grad[0]) - lr = self._create_param_lr(param_and_grad) - # create the adam optimize op - - if framework.in_dygraph_mode(): - _beta1 = self._beta1 if not isinstance( - self._beta1, Variable) else self._beta1.numpy().item(0) - _beta2 = self._beta2 if not isinstance( - self._beta2, Variable) else self._beta2.numpy().item(0) - _, _, _, _, _ = core.ops.adam( - param_and_grad[0], param_and_grad[1], lr, moment1, moment2, - beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1, - moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon, - 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', - 1000, 'beta1', _beta1, 'beta2', _beta2) - - return None - - inputs = { - "Param": [param_and_grad[0]], - "Grad": [param_and_grad[1]], - "LearningRate": [lr], - "Moment1": [moment1], - "Moment2": [moment2], - "Beta1Pow": [beta1_pow_acc], - "Beta2Pow": [beta2_pow_acc] - } - outputs = { - "ParamOut": [param_and_grad[0]], - "Moment1Out": [moment1], - "Moment2Out": [moment2], - "Beta1PowOut": [beta1_pow_acc], - "Beta2PowOut": [beta2_pow_acc], - } - attrs = { - "epsilon": self._epsilon, - "lazy_mode": self._lazy_mode, - "min_row_size_to_use_multithread": 1000 - } - - if isinstance(self._beta1, Variable): - inputs['Beta1Tensor'] = self._beta1 - else: - attrs['beta1'] = self._beta1 - if isinstance(self._beta2, Variable): - inputs['Beta2Tensor'] = self._beta2 - else: - attrs['beta2'] = self._beta2 - - adam_op = block.append_op( - type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True) - - return adam_op - - -class AdamaxOptimizer(Optimizer): - """ - The Adamax optimizer is implemented based on the Adamax Optimization - in Section 7 of `Adam paper `_. - The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm, - which makes the learning rate update algorithm more stable and simple. - - The parameter ``param_out`` update rule with gradient ``grad``: - - .. math:: - - t & = t + 1 - - moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad - - inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|) - - learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t} - - param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out} - - Related paper: `Adam: A Method for Stochastic Optimization `_ - - The original paper does not have an ``epsilon`` attribute, - it is added here for numerical stability to prevent the division by 0 error. - - Args: - lr (float|Tensor, optional): The learning rate used to update ``Parameter``. - It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. - betas (list[float]|list[Tensor], optional): The list of exponential decay rate for moment estimates. - The elements of list should be float numbers or Tensor with shape [1] and data type as float32. - The default value is [0.9, 0.999]. - epsilon (float, optional): A small float value for numerical stability. - The default value is 1e-08. - parms (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - weight_decay (float, optional): Weight decay of L2 regularization. The default value is 0. 
- grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. - The default value is None. - - **Notes**: - **Currently, Adamax doesn't support sparse parameter optimization.** - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - import numpy - - # First create the Executor. - place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - data = fluid.data(name='X', shape=[None, 1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) - adam = paddle.optimizer.Adamax(lr=0.2) - adam.minimize(loss) - - # Run the startup program once and only once. - exe.run(startup_program) - - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - """ - _moment_acc_str = "moment" - _inf_norm_acc_str = "inf_norm" - _beta1_pow_acc_str = "beta1_pow_acc" - - def __init__(self, - learning_rate=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-8, - parameter_list=None, - regularization=None, - grad_clip=None, - name=None): - assert learning_rate is not None - assert beta1 is not None - assert beta2 is not None - assert epsilon is not None - super(AdamaxOptimizer, self).__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name) - self.type = "adamax" - self._beta1 = beta1 - self._beta2 = beta2 - self._epsilon = epsilon - - def _create_accumulators(self, block, parameters): - # Create accumulator tensors for first moment and infinity norm - for p in parameters: - self._add_accumulator(self._moment_acc_str, p) - self._add_accumulator(self._inf_norm_acc_str, p) - self._add_accumulator( - name=self._beta1_pow_acc_str, - param=p, - fill_value=self._beta1, - shape=[1]) - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - - moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0]) - inf_norm = self._get_accumulator(self._inf_norm_acc_str, - param_and_grad[0]) - beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - param_and_grad[0]) - # create the adamax optimize op - adamax_op = block.append_op( - type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "LearningRate": self._create_param_lr(param_and_grad), - "Moment": moment, - "InfNorm": inf_norm, - "Beta1Pow": beta1_pow_acc - }, - outputs={ - "ParamOut": param_and_grad[0], - "MomentOut": moment, - "InfNormOut": inf_norm - }, - attrs={ - "beta1": self._beta1, - "beta2": self._beta2, - "epsilon": self._epsilon - }, - stop_gradient=True) - - return adamax_op - - def _finish_update(self, block, parameters_and_grads): - """Update Beta1 Power accumulator - """ - assert isinstance(block, framework.Block) - for param, grad in parameters_and_grads: - if grad is None or param.trainable is False: - continue - with 
param.block.program._optimized_guard( - [param, grad]), name_scope('adamax'): - beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - param) - block.append_op( - type="scale", - inputs={"X": beta1_pow_acc}, - outputs={"Out": beta1_pow_acc}, - attrs={"scale": self._beta1}, - stop_gradient=True) - - -class AdamW(optimizer): - """ - The AdamW optimizer is implemented based on the AdamW Optimization - in paper `DECOUPLED WEIGHT DECAY REGULARIZATION `_. - it can resolves the problem of L2 regularization failure in the Adam optimizer. - - .. math:: - - t & = t + 1 - - moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad - - moemnt\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad - - learning\_rate & = learning\_rate * \\ - \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {beta}_1^t} - - param\_out & = param - learning\_rate * (\\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) - - - Args: - lr (float|Tensor, optional): The learning rate used to update ``Parameter``. - It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. - betas (list[float]|list[Tensor], optional): The list of exponential decay rate for moment estimates. - The elements of list should be float numbers or Tensor with shape [1] and data type as float32. - The default value is [0.9, 0.999]. - epsilon (float, optional): A small float value for numerical stability. - The default value is 1e-08. - parms (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - weight_decay (float, optional): Weight decay of L2 regularization. The default value is 0. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. - The default value is None. - - **Notes**: - **Currently, AdamW doesn't support sparse parameter optimization.** - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - import numpy - - # First create the Executor. - place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - data = fluid.data(name='X', shape=[None, 1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) - adam = paddle.optimizer.AdamW(lr=0.2) - adam.minimize(loss) - - # Run the startup program once and only once. - exe.run(startup_program) - - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - """ - - -class DpsgdOptimizer(Optimizer): - """ - We implement the Dpsgd optimizer according to CCS16 paper - - Deep Learning with Differential Privacy. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - import numpy - - # First create the Executor. 
- place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - data = fluid.layers.data(name='X', shape=[1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) - optimizer = paddle.optimizer.Dpsgd(learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0) - optimizer.minimize(loss) - - # Run the startup program once and only once. - exe.run(startup_program) - - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - - Args: - learning_rate (float|Variable): the learning rate used to update parameters. \ - Can be a float value or a Variable with one float value as data element. - clip (float): clipping threshold - batch_size (float): batch size. - sigma (float): for gaussian noise. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - Notes: - Currently, DpsgdOptimizer doesn't support sparse parameter optimization. - """ - - def __init__(self, - learning_rate=0.001, - clip=0.9, - batch_size=0.999, - sigma=1e-8, - parameter_list=None): - assert learning_rate is not None - assert clip is not None - assert batch_size is not None - assert sigma is not None - super(DpsgdOptimizer, self).__init__( - learning_rate=learning_rate, parameter_list=parameter_list) - self.type = "dpsgd" - self._clip = clip - self._batch_size = batch_size - self._sigma = sigma - ''' - Note(wangzhongpu): - This property is only used for debugging, do not need to set it! - Dpsgd operator use time(NULL) as random seed to generate random number. - However, during debugging, we need determinated result, so we will set self._seed to a fixed number. - ''' - self._seed = None - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - - # create the dpsgd optimize op - if self._seed == None: - self._seed = 0 - - dpsgd_op = block.append_op( - type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "LearningRate": self._create_param_lr(param_and_grad) - }, - outputs={"ParamOut": param_and_grad[0]}, - attrs={ - "clip": self._clip, - "batch_size": self._batch_size, - "sigma": self._sigma, - "seed": self._seed - }, - stop_gradient=True) - - return dpsgd_op - - -class DecayedAdagradOptimizer(Optimizer): - """ - The Decayed Adagrad optimizer can be seen as an Adagrad algorithm that introduces - the decay rate to solve the problem of a sharp drop in the learning rate - during model training when using the AdagradOptimizer. - - The parameter ``param_out`` update rule with gradient ``grad``: - - .. math:: - - moment\_out & = decay * moment + (1 - decay) * grad * grad - - param\_out & = param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} - - Related paper: `Adaptive Subgradient Methods for Online Learning and Stochastic - Optimization `_. - - The original paper does not have an ``epsilon`` attribute. It is added here for numerical - stability to avoid the division by zero error. - - Args: - learning_rate (float|Variable): The learning rate used to update ``Parameter``. - It can be a float value or a ``Variable`` with a float type. - decay (float, optional): The decay rate. 
The default value is 0.95. - epsilon (float, optional): A small float value for numerical stability. - The default value is 1e-06. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. - The default value is None. - - **Notes**: - **Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.** - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - - x = fluid.data( name='x', shape=[None, 10], dtype='float32' ) - trans = fluid.layers.fc( x, 100 ) - cost = fluid.layers.reduce_mean( trans ) - optimizer = paddle.optimizer.DecayedAdagradOptimizer(learning_rate=0.2) - optimizer.minimize(cost) - """ - _moment_acc_str = "moment" - - def __init__(self, - learning_rate, - decay=0.95, - epsilon=1.0e-6, - parameter_list=None, - regularization=None, - grad_clip=None, - name=None): - assert learning_rate is not None - assert decay is not None - assert epsilon is not None - - super(DecayedAdagradOptimizer, self).__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name) - self.type = "decayed_adagrad" - self._decay = decay - self._epsilon = epsilon - - def _create_accumulators(self, block, parameters): - assert isinstance(block, framework.Block) - - for p in parameters: - self._add_accumulator(self._moment_acc_str, p) - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - - moment_acc = self._get_accumulator(self._moment_acc_str, - param_and_grad[0]) - - # Create the decayed adagrad optimizer op - decayed_adagrad_op = block.append_op( - type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Moment": moment_acc, - "LearningRate": self._create_param_lr(param_and_grad) - }, - outputs={"ParamOut": param_and_grad[0], - "MomentOut": moment_acc}, - attrs={"epsilon": self._epsilon, - "decay": self._decay}, - stop_gradient=True) - - return decayed_adagrad_op - - -class AdadeltaOptimizer(Optimizer): - """ - **Notes: This API does not support sparse parameter optimization.** - - Adadelta Optimizer. Please refer to this for details: - `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD `_. - - The update is done as follows: - - .. 
math:: - - E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 - - learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \\epsilon ) / ( E(g_t^2) + \\epsilon ) } - - E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\_rate)^2 - - Args: - learning_rate (float|Variable): global learning rate. - epsilon (float): a small float number for numeric stability. Default 1.0e-6. - rho (float): a floating point value indicating the decay rate. Default 0.95. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): The default value is None. Normally there is no need for user - to set this property. For more information, please refer to - :ref:`api_guide_Name` . - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - - image = fluid.data(name='image', shape=[None, 28], dtype='float32') - fc = fluid.layers.fc(image, size=10) - cost = fluid.layers.reduce_mean(fc) - optimizer = paddle.optimizer.Adadelta( - learning_rate=0.0003, epsilon=1.0e-6, rho=0.95) - - # optimizer_ops is a list of optimizer operators to update parameters - # params_grads is a list of (param, param_grad), where param is each - # parameter and param_grad is the gradient variable of param. 
- optimizer_ops, params_grads = optimizer.minimize(cost) - """ - - _avg_squared_grad_acc_str = "_avg_squared_grad" - _avg_squared_update_acc_str = "_avg_squared_update" - - def __init__(self, - learning_rate, - epsilon=1.0e-6, - rho=0.95, - parameter_list=None, - regularization=None, - grad_clip=None, - name=None): - if learning_rate is None: - raise ValueError("learning_rate is not set.") - if epsilon is None: - raise ValueError("epsilon is not set.") - if rho is None: - raise ValueError("rho is not set.") - super(AdadeltaOptimizer, self).__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name) - self.type = "adadelta" - self._epsilon = epsilon - self._rho = rho - - def _create_accumulators(self, block, parameters): - if not isinstance(block, framework.Block): - raise TypeError("block is not instance of framework.Block.") - - for p in parameters: - self._add_accumulator(self._avg_squared_grad_acc_str, p) - self._add_accumulator(self._avg_squared_update_acc_str, p) - - def _append_optimize_op(self, block, param_and_grad): - if not isinstance(block, framework.Block): - raise TypeError("block is not instance of framework.Block.") - - avg_squared_grad_acc = self._get_accumulator( - self._avg_squared_grad_acc_str, param_and_grad[0]) - avg_squared_update_acc = self._get_accumulator( - self._avg_squared_update_acc_str, param_and_grad[0]) - - # Create the adadelta optimizer op - adadelta_op = block.append_op( - type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "AvgSquaredGrad": avg_squared_grad_acc, - "AvgSquaredUpdate": avg_squared_update_acc - }, - outputs={ - "ParamOut": param_and_grad[0], - "AvgSquaredGradOut": avg_squared_grad_acc, - "AvgSquaredUpdateOut": avg_squared_update_acc - }, - attrs={"epsilon": self._epsilon, - "rho": self._rho}, - stop_gradient=True) - - return adadelta_op - - -class RMSPropOptimizer(Optimizer): - """ - Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning - rate method. The original slides proposed RMSProp: Slide 29 of - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf . - - The original equation is as follows: - - .. math:: - - r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 - - w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w) - - The first equation calculates moving average of the squared gradient for - each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`. - - In some cases, adding a momentum term :math: `\\beta` is beneficial. - In our implementation, Nesterov momentum is used: - - .. math:: - - r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 - - v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) + - \\epsilon}} \\nabla Q_{i}(w) - - w & = w - v(w, t) - - if centered is True: - - .. math:: - - r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 - - g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w) - - v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 + - \\epsilon}} \\nabla Q_{i}(w) - - w & = w - v(w, t) - - where, :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95 - and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a - smoothing term to avoid division by zero, usually set somewhere in range - from 1e-4 to 1e-8. - - - Parameters: - lr(float): Global learning rate. - alpha(float): rho is :math: `\\rho` in equation, default is 0.95. 
- eps(float): :math: `\\epsilon` in equation is smoothing term to - avoid division by zero, default is 1e-6. - momentum(float): :math:`\\beta` in equation is the momentum term, - default is 0.0. - centered(bool): If True, gradients are normalized by the estimated variance of - the gradient; if False, by the uncentered second moment. Setting this to - True may help with training, but is slightly more expensive in terms of - computation and memory. Defaults to False. - params (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - weight_decay (float, optional): Weight decay of L2 regularization. The default value is 0. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): This parameter is used by developers to print debugging information. \ - For details, please refer to :ref:`api_guide_Name`. Default is None. - - Raises: - ValueError: If lr, rho, epsilon, momentum are None. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - rms_optimizer = paddle.optimizer.RMSProp(lr=0.1) - rms_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - - """ - - _momentum_acc_str = "momentum" - _mean_square_acc_str = "mean_square" - _mean_grad_acc_str = "mean_grad" - - def __init__(self, - learning_rate, - rho=0.95, - epsilon=1.0e-6, - momentum=0.0, - centered=False, - parameter_list=None, - regularization=None, - grad_clip=None, - name=None): - super(RMSPropOptimizer, self).__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name) - if learning_rate is None: - raise ValueError("learning_rate is not set.") - if rho is None: - raise ValueError("rho is not set.") - if epsilon is None: - raise ValueError("epsilon is not set.") - if momentum is None: - raise ValueError("momentum is not set.") - - self.type = "rmsprop" - self._rho = rho - self._epsilon = epsilon - self._momentum = momentum - self._centered = centered - - def _create_accumulators(self, block, parameters): - if not isinstance(block, framework.Block): - raise TypeError("block is not instance of framework.Block.") - - for p in parameters: - self._add_accumulator(self._momentum_acc_str, p) - self._add_accumulator(self._mean_square_acc_str, p) - self._add_accumulator(self._mean_grad_acc_str, p) - - def _append_optimize_op(self, block, 
param_and_grad): - if not isinstance(block, framework.Block): - raise TypeError("block is not instance of framework.Block.") - - momentum_acc = self._get_accumulator(self._momentum_acc_str, - param_and_grad[0]) - mean_square_acc = self._get_accumulator(self._mean_square_acc_str, - param_and_grad[0]) - mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str, - param_and_grad[0]) - rmsprop_op = block.append_op( - type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Moment": momentum_acc, - "MeanSquare": mean_square_acc, - "MeanGrad": mean_grad_acc, - "LearningRate": self._create_param_lr(param_and_grad), - }, - outputs={ - "ParamOut": param_and_grad[0], - "MomentOut": momentum_acc, - "MeanSquareOut": mean_square_acc, - "MeanGradOut": mean_grad_acc - }, - attrs={ - "epsilon": self._epsilon, - "decay": self._rho, - "momentum": self._momentum, - "centered": self._centered - }, - stop_gradient=True) - - return rmsprop_op - - -class FtrlOptimizer(Optimizer): - """ - FTRL (Follow The Regularized Leader) Optimizer. - - The paper that proposed Follow The Regularized Leader (FTRL): - (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf) - - .. math:: - - &new\_accum = squared\_accum + grad^2 - - &if (lr\_power == -0.5): - - &\quad linear\_accum += grad - \\frac{\\sqrt{new\_accum} - \\sqrt{squared\_accum}}{learning\_rate * param} - - &else: - - &\quad linear\_accum += grad - \\frac{new\_accum^{-lr\_power} - accum^{-lr\_power}}{learning\_rate * param} - - - &x = l1 * sign(linear\_accum) - linear\_accum - - &if (lr\_power == -0.5): - - &\quad y = \\frac{\\sqrt{new\_accum}}{learning\_rate} + (2 * l2) - - &\quad pre\_shrink = \\frac{x}{y} - - &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) - - &else: - - &\quad y = \\frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2) - - &\quad pre\_shrink = \\frac{x}{y} - - &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) - - &squared\_accum += grad^2 - - Parameters: - learning_rate (float|Variable): Global learning rate. - l1 (float): L1 regularization strength, default is 0.0. - l2 (float): L2 regularization strength, default is 0.0. - lr_power (float): Learning Rate Power, default is -0.5. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): This parameter is used by developers to print debugging information. \ - For details, please refer to :ref:`api_guide_Name`. Default is None. 
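    A rough NumPy sketch of the ``lr_power = -0.5`` branch of the update above
    (names are illustrative; this follows the referenced FTRL-Proximal paper and
    is not the Paddle kernel itself):

    .. code-block:: python

        import numpy as np

        def ftrl_step(param, grad, squared_acc, linear_acc, lr, l1=0.0, l2=0.0):
            new_acc = squared_acc + grad * grad
            # how much the effective per-coordinate learning rate shrank this step
            sigma = (np.sqrt(new_acc) - np.sqrt(squared_acc)) / lr
            linear_acc = linear_acc + grad - sigma * param
            x = l1 * np.sign(linear_acc) - linear_acc
            y = np.sqrt(new_acc) / lr + 2.0 * l2
            # coordinates with |linear_acc| <= l1 are shrunk exactly to zero
            new_param = np.where(np.abs(linear_acc) > l1, x / y, 0.0)
            return new_param, new_acc, linear_acc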
-
-    Raises:
-        ValueError: If learning_rate is None.
-
-    Examples:
-          .. code-block:: python
-
-            import paddle
-            import paddle.fluid as fluid
-            import numpy as np
-
-            place = fluid.CPUPlace()
-            main = fluid.Program()
-            with fluid.program_guard(main):
-                x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-                y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-                y_predict = fluid.layers.fc(input=x, size=1, act=None)
-                cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-                avg_cost = fluid.layers.mean(cost)
-
-                ftrl_optimizer = paddle.optimizer.Ftrl(learning_rate=0.1)
-                ftrl_optimizer.minimize(avg_cost)
-
-                fetch_list = [avg_cost]
-                train_reader = paddle.batch(
-                    paddle.dataset.uci_housing.train(), batch_size=1)
-                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-                exe = fluid.Executor(place)
-                exe.run(fluid.default_startup_program())
-                for data in train_reader():
-                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
-
-    NOTE:
-       Currently, FtrlOptimizer doesn't support sparse parameter optimization.
-    """
-
-    _squared_acc_str = "squared"
-    _linear_acc_str = "linear"
-
-    def __init__(self,
-                 learning_rate,
-                 l1=0.0,
-                 l2=0.0,
-                 lr_power=-0.5,
-                 parameter_list=None,
-                 regularization=None,
-                 grad_clip=None,
-                 name=None):
-        super(FtrlOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            parameter_list=parameter_list,
-            regularization=regularization,
-            grad_clip=grad_clip,
-            name=name)
-        if learning_rate is None:
-            raise ValueError("learning_rate is not set.")
-
-        self.type = "ftrl"
-        self._l1 = l1
-        self._l2 = l2
-        self._lr_power = lr_power
-
-    def _create_accumulators(self, block, parameters):
-        if not isinstance(block, framework.Block):
-            raise TypeError("block is not instance of framework.Block.")
-
-        for p in parameters:
-            self._add_accumulator(self._squared_acc_str, p)
-            self._add_accumulator(self._linear_acc_str, p)
-
-    def _append_optimize_op(self, block, param_and_grad):
-        if not isinstance(block, framework.Block):
-            raise TypeError("block is not instance of framework.Block.")
-
-        squared_acc = self._get_accumulator(self._squared_acc_str,
-                                            param_and_grad[0])
-        linear_acc = self._get_accumulator(self._linear_acc_str,
-                                           param_and_grad[0])
-        ftrl_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "SquaredAccumulator": squared_acc,
-                "LinearAccumulator": linear_acc,
-                "LearningRate": self._create_param_lr(param_and_grad),
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "SquaredAccumOut": squared_acc,
-                "LinearAccumOut": linear_acc
-            },
-            attrs={"l1": self._l1,
-                   "l2": self._l2,
-                   "lr_power": self._lr_power},
-            stop_gradient=True)
-
-        return ftrl_op
-
-
-class LambOptimizer(Adam):
-    """
-    LAMB (Layer-wise Adaptive Moments optimizer for Batch training) Optimizer.
-
-    LAMB Optimizer is designed to scale up the batch size of training without losing
-    accuracy, which supports adaptive element-wise updating and accurate layer-wise
-    correction. For more information, please refer to `Large Batch Optimization for
-    Deep Learning: Training BERT in 76 minutes `_ .
-
-    The updating of parameters follows:
-
-    .. math::
-
-        m_t &= \\beta_1 m_{t - 1} + (1 - \\beta_1)g_t
-
-        v_t &= \\beta_2 v_{t - 1} + (1 - \\beta_2)g_t^2
-
-        r_t &= \\frac{m_t}{\\sqrt{v_t}+\\epsilon}
-
-        w_t &= w_{t-1} - \\eta_t \\frac{\\left \| w_{t-1}\\right \|}{\\left \| r_t + \\lambda w_{t-1}\\right \|} (r_t + \\lambda w_{t-1})
-
-
-    where :math:`m` is the 1st moment, :math:`v` the 2nd moment, :math:`\\eta` the
-    learning rate, and :math:`\\lambda` the LAMB weight decay rate.
-
-    Args:
-        learning_rate (float|Variable, optional): the learning rate used to update parameters. \
-            Can be a float value or a Variable with data type float32. Default 0.001.
-        lamb_weight_decay (float, optional): The LAMB weight decay rate. Default 0.01.
-        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
-            Default 0.9.
-        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
-            Default 0.999.
-        epsilon (float, optional): A small float value for numerical stability. Default 1e-6.
-        parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
-            This parameter is required in dygraph mode. \
-            The default value is None in static mode, at this time all parameters will be updated.
-        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two methods: \
-            :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
-            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
-            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
-            Default None, meaning there is no regularization.
-        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
-            some derived class of ``GradientClipBase`` . There are three clipping strategies
-            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
-            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
-        exclude_from_weight_decay_fn (function|None): Exclude a parameter from weight
-            decay when **exclude_from_weight_decay_fn(parameter)** returns True.
-            Default None.
-        name (str|None): For detailed information, please refer to
-            :ref:`api_guide_Name` . Usually the name does not need to be set, and it is None by default.
-
-    Examples:
-        ..
code-block:: python - - import paddle.fluid as fluid - import paddle - - data = fluid.data(name='x', shape=[-1, 5], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - cost = fluid.layers.mean(hidden) - - def exclude_fn(param): - return param.name.endswith('.b_0') - - optimizer = paddle.optimizer.Lamb(learning_rate=0.002, - exclude_from_weight_decay_fn=exclude_fn) - optimizer.minimize(cost) - """ - _moment1_acc_str = "moment1" - _moment2_acc_str = "moment2" - # these two not used in op temporarily - _beta1_pow_acc_str = "beta1_pow_acc" - _beta2_pow_acc_str = "beta2_pow_acc" - - def __init__(self, - learning_rate=0.001, - lamb_weight_decay=0.01, - beta1=0.9, - beta2=0.999, - epsilon=1e-6, - parameter_list=None, - regularization=None, - grad_clip=None, - exclude_from_weight_decay_fn=None, - name=None): - assert learning_rate is not None - assert lamb_weight_decay is not None - assert beta1 is not None - assert beta2 is not None - assert epsilon is not None - super(LambOptimizer, self).__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - beta1=beta1, - beta2=beta2, - epsilon=epsilon, - name=name) - self.type = "lamb" - self._weight_decay = lamb_weight_decay - self._exclude_from_weight_decay_fn = exclude_from_weight_decay_fn - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - block.program._use_lamb = True - - moment1 = self._get_accumulator(self._moment1_acc_str, - param_and_grad[0]) - moment2 = self._get_accumulator(self._moment2_acc_str, - param_and_grad[0]) - beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - param_and_grad[0]) - beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, - param_and_grad[0]) - - if self._exclude_from_weight_decay_fn is not None \ - and self._exclude_from_weight_decay_fn(param_and_grad[0]): - weight_decay = 0.0 - else: - weight_decay = self._weight_decay - - # create the lamb optimize op - lamb_op = block.append_op( - type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "LearningRate": self._create_param_lr(param_and_grad), - "Moment1": moment1, - "Moment2": moment2, - "Beta1Pow": beta1_pow_acc, - "Beta2Pow": beta2_pow_acc - }, - outputs={ - "ParamOut": param_and_grad[0], - "Moment1Out": moment1, - "Moment2Out": moment2 - }, - attrs={ - "beta1": self._beta1, - "beta2": self._beta2, - "epsilon": self._epsilon, - "weight_decay": weight_decay - }, - stop_gradient=True) - - return lamb_op - - -# We short the class name, since users will use the optimizer with the package -# name. The sample code: -# -# import paddle.fluid as fluid -# -# sgd = paddle.optimizer.SGD(...) -# -# It is no need to add an `Optimizer` as the class suffix -SGD = SGDOptimizer -Momentum = MomentumOptimizer -Adagrad = AdagradOptimizer -Adamax = AdamaxOptimizer -Dpsgd = DpsgdOptimizer -DecayedAdagrad = DecayedAdagradOptimizer -Adadelta = AdadeltaOptimizer -RMSProp = RMSPropOptimizer -Ftrl = FtrlOptimizer -LarsMomentum = LarsMomentumOptimizer -Lamb = LambOptimizer - - -class ModelAverage(Optimizer): - """ - :api_attr: Static Graph - - The ModelAverage optimizer accumulates specific continuous historical parameters - during training. The accumulated historical range can be controlled by the passed - ``average_window_rate`` argument. The averaged ``Parameter`` are used in the prediction, - which usually can improve the accuracy of the prediction. 
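Before the full API description below, here is a conceptual, framework-free sketch of what
parameter averaging does: keep a running sum of recent parameter values, temporarily swap
the average in for evaluation, and restore the latest value afterwards. The class and names
here are illustrative only, not the ModelAverage implementation:

    .. code-block:: python

        class RunningAverage(object):
            """Toy running average of one scalar parameter."""

            def __init__(self):
                self._sum = 0.0
                self._count = 0
                self._backup = None

            def accumulate(self, value):
                self._sum += value
                self._count += 1

            def apply(self, current_value):
                # back up the latest value and return the averaged one
                self._backup = current_value
                return self._sum / max(self._count, 1)

            def restore(self):
                # hand back the latest (un-averaged) value
                return self._backup


        avg = RunningAverage()
        w = 0.0
        for w in [0.9, 1.1, 1.0]:      # pretend these are training updates
            avg.accumulate(w)
        w_eval = avg.apply(w)          # averaged weight used for prediction
        w = avg.restore()              # back to the latest training weight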
-
-    The average of the ``Parameter`` in the sliding window is accumulated into a temporary
-    variable. The averaged result can be applied to the current model's ``Parameter`` by calling
-    the ``apply()`` method, and the current model ``Parameter`` can be restored by calling
-    the ``restore()`` method.
-
-    The window size for calculating the average is determined by ``average_window_rate``,
-    ``min_average_window``, ``max_average_window`` and the current ``Parameter`` update times (num_updates).
-
-    When the accumulation count (num_accumulates) reaches the window
-    threshold (average_window), the accumulated ``Parameter`` temporary variable is reset to 0.0.
-    The following snippet helps to understand the role of these arguments:
-
-    ::
-
-        if num_accumulates >= min_average_window and num_accumulates >= min(max_average_window, num_updates * average_window_rate):
-            num_accumulates = 0
-
-    In the condition above, ``num_accumulates`` indicates the current
-    accumulated count, which can be abstractly understood as the length of the cumulative window.
-    The length of the window must be at least the length set by the ``min_average_window`` argument,
-    and cannot exceed the length specified by the ``max_average_window`` argument or
-    ``num_updates * average_window_rate``, where ``num_updates`` indicates the current ``Parameter``
-    update times and ``average_window_rate`` is a coefficient that determines the window length.
-
-    Args:
-        average_window_rate (float): The ratio of the average window length to the number of ``Parameter`` updates.
-        min_average_window (int, optional): The minimum size of the average window length. The default value is 10000.
-        max_average_window (int, optional): The maximum size of the average window length. The default value is 10000.
-        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two methods: \
-            :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
-            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
-            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
-            Default None, meaning there is no regularization.
-        name (str, optional): Normally there is no need for the user to set this property.
-            For more information, please refer to :ref:`api_guide_Name`.
-            The default value is None.
-
-    Examples:
-
-      .. code-block:: python
-
-        import paddle.fluid as fluid
-        import paddle
-        import numpy
-
-        # First create the Executor.
- place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - # build net - data = fluid.data(name='X', shape=[None, 1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) - optimizer = paddle.optimizer.Momentum(learning_rate=0.2, momentum=0.1) - optimizer.minimize(loss) - - # build ModelAverage optimizer - model_average = paddle.optimizer.ModelAverage(0.15, - min_average_window=10000, - max_average_window=12500) - - exe.run(startup_program) - for i in range(12500): - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - - # apply ModelAverage - with model_average.apply(exe): - x = numpy.random.random(size=(10, 1)).astype('float32') - exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - """ - - def __init__(self, - average_window_rate, - min_average_window=10000, - max_average_window=10000, - regularization=None, - name=None): - if framework.in_dygraph_mode(): - raise Exception("In dygraph, don't support ModelAverage.") - super(ModelAverage, self).__init__( - 0.0, regularization=regularization, name=name) - self.average_window = average_window_rate - self.min_average_window = min_average_window - self.max_average_window = max_average_window - - self.params_grads = [] - for param in framework.default_main_program().global_block( - ).all_parameters(): - if param.do_model_average != False: - grad = param.block.create_var( - name=unique_name.generate_with_ignorable_key(".".join( - [param.name, 'tmp'])), - dtype=param.dtype, - persistable=False, - stop_gradient=True) - self.params_grads.append((param, grad)) - - for param, grad in self.params_grads: - if grad is None: - continue - with param.block.program._optimized_guard( - [param, grad]), name_scope('move_average'): - self._append_average_accumulate_op(param) - - self.apply_program = Program() - block = self.apply_program.global_block() - with program_guard(main_program=self.apply_program): - for param_grad in self.params_grads: - self._add_average_apply_op(block, param_grad) - - self.restore_program = Program() - block = self.restore_program.global_block() - with program_guard(main_program=self.restore_program): - for param_grad in self.params_grads: - self._add_average_restore_op(block, param_grad) - - def _add_average_apply_op(self, block, param_grad): - param = block._clone_variable(param_grad[0]) - grad = block._clone_variable(param_grad[1]) - sum_1 = block._clone_variable(self._get_accumulator('sum_1', param)) - sum_2 = block._clone_variable(self._get_accumulator('sum_2', param)) - sum_3 = block._clone_variable(self._get_accumulator('sum_3', param)) - num_accumulates = block._clone_variable( - self._get_accumulator('num_accumulates', param)) - old_num_accumulates = block._clone_variable( - self._get_accumulator('old_num_accumulates', param)) - num_updates = block._clone_variable( - self._get_accumulator('num_updates', param)) - # backup param value to grad - layers.assign(input=param, output=grad) - # param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates) - tmp = layers.sum(x=[num_accumulates, old_num_accumulates]) - sum = layers.sum(x=[sum_1, sum_2, sum_3]) - tmp = layers.cast( - x=tmp, dtype='float32' if self._dtype == None else self._dtype) - sum = layers.cast( - x=sum, dtype='float32' if self._dtype == None 
else self._dtype) - ops._elementwise_div(x=sum, y=tmp, out=param) - - def _add_average_restore_op(self, block, param_grad): - param = block._clone_variable(param_grad[0]) - grad = block._clone_variable(param_grad[1]) - layers.assign(input=grad, output=param) - - def _append_average_accumulate_op(self, param): - self.helper = LayerHelper("average_accumulate") - sum_1 = self._add_accumulator('sum_1', param) - sum_2 = self._add_accumulator('sum_2', param) - sum_3 = self._add_accumulator('sum_3', param) - num_accumulates = self._add_accumulator( - 'num_accumulates', param, dtype='int64', shape=[1]) - old_num_accumulates = self._add_accumulator( - 'old_num_accumulates', param, dtype='int64', shape=[1]) - num_updates = self._add_accumulator( - 'num_updates', param, dtype='int64', shape=[1]) - - self.helper.append_op( - type='average_accumulates', - inputs={ - "param": param, - "in_sum_1": sum_1, - "in_sum_2": sum_2, - "in_sum_3": sum_3, - "in_num_accumulates": num_accumulates, - "in_old_num_accumulates": old_num_accumulates, - "in_num_updates": num_updates - }, - outputs={ - "out_sum_1": sum_1, - "out_sum_2": sum_2, - "out_sum_3": sum_3, - "out_num_accumulates": num_accumulates, - "out_old_num_accumulates": old_num_accumulates, - "out_num_updates": num_updates, - }, - attrs={ - "average_window": self.average_window, - "min_average_window": self.min_average_window, - "max_average_window": self.max_average_window, - }, - stop_gradient=True) - - @signature_safe_contextmanager - def apply(self, executor, need_restore=True): - """ - Apply the average of the cumulative ``Parameter`` to the parameters of the current model. - - Args: - executor(fluid.Executor): The current network executor. - need_restore(bool): Restore flag variable, if set to True, the network will restore - the parameters of the network to the default value, if set to False, - it will not be restored. The default value is True. - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import paddle - import numpy - - # First create the Executor. - place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - # build net - data = fluid.data(name='X', shape=[None, 1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) - optimizer = paddle.optimizer.Momentum(learning_rate=0.2, momentum=0.1) - optimizer.minimize(loss) - - # build ModelAverage optimizer - model_average = paddle.optimizer.ModelAverage(0.15, - min_average_window=10000, - max_average_window=12500) - - exe.run(startup_program) - for i in range(12500): - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - - # apply ModelAverage - with model_average.apply(exe): - x = numpy.random.random(size=(10, 1)).astype('float32') - exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - """ - executor.run(self.apply_program) - try: - yield - finally: - if need_restore: - self.restore(executor) - - def restore(self, executor): - """ - Restore ``Parameter`` values of current model. - - Args: - executor(fluid.Executor): The current network executor. - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import paddle - import numpy - - # First create the Executor. 
- place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - # build net - data = fluid.data(name='X', shape=[None, 1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) - optimizer = paddle.optimizer.Momentum(learning_rate=0.2, momentum=0.1) - optimizer.minimize(loss) - - # build ModelAverage optimizer - model_average = paddle.optimizer.ModelAverage(0.15, - min_average_window=10000, - max_average_window=12500) - - exe.run(startup_program) - for i in range(12500): - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - - # apply ModelAverage - with model_average.apply(exe, False): - x = numpy.random.random(size=(10, 1)).astype('float32') - exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - - # restore Parameters - model_average.restore(exe) - """ - executor.run(self.restore_program) - - -class ExponentialMovingAverage(object): - """ - :api_attr: Static Graph - - Compute the moving average of parameters with exponential decay. - Given a parameter :math:`\\theta`, its exponential moving average (EMA) - will be - - .. math:: - - \\text{EMA}_0 & = 0 - - \\text{EMA}_t & = \\text{decay} * \\text{EMA}_{t-1} + (1 - \\text{decay}) * \\theta_t - - The average results calculated by **update()** method will be saved in - temporary variables which are created and maintained by the object, and can - be applied to parameters of current model by calling **apply()** method. And - the **restore()** method is used to restore the parameters. - - **Bias correction**. All EMAs are initialized to :math:`0` and hence they will be - zero biased, which can be corrected by divided by a factor - :math:`(1 - \\text{decay}^t)` , i.e., the actual EMAs applied to parameters - when calling **apply()** method would be - - .. math:: - - \\widehat{\\text{EMA}}_t = \\frac{\\text{EMA}_t}{1 - \\text{decay}^t} - - **Decay rate scheduling**. A large decay rate very close to 1 would result - in that the averages move very slowly. And a better strategy is to set a - relative smaller decay rate in the very beginning. The argument **thres_steps** - allows users to pass a Variable to schedule the decay rate, in this case, - the actual decay rate becomes - - .. math:: - - \\min(\\text{decay}, \\frac{1 + \\text{thres_steps}}{10 + \\text{thres_steps}}) - - Usually **thres_steps** can be the global training steps. - - - Args: - decay (float, optional): The exponential decay rate, usually close to 1, such as - 0.999, 0.9999, ... . Default 0.999. - thres_steps (Variable|None): If not `None`, schedule the decay rate. - Default None. - name (str|None): For detailed information, please refer to - :ref:`api_guide_Name`. Usually name is no need to set and None by - default. - - - Examples: - - .. 
code-block:: python - - import numpy - import paddle - import paddle.fluid as fluid - - data = fluid.data(name='x', shape=[-1, 5], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - cost = fluid.layers.mean(hidden) - - test_program = fluid.default_main_program().clone(for_test=True) - - optimizer = paddle.optimizer.Adam(lr=0.001) - optimizer.minimize(cost) - - global_steps = fluid.layers.autoincreased_step_counter() - ema = paddle.optimizer.ExponentialMovingAverage(0.999, thres_steps=global_steps) - ema.update() - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - for pass_id in range(3): - for batch_id in range(6): - data = numpy.random.random(size=(10, 5)).astype('float32') - exe.run(program=fluid.default_main_program(), - feed={'x': data}, - fetch_list=[cost.name]) - - # usage 1 - with ema.apply(exe): - data = numpy.random.random(size=(10, 5)).astype('float32') - exe.run(program=test_program, - feed={'x': data}, - fetch_list=[hidden.name]) - - - # usage 2 - with ema.apply(exe, need_restore=False): - data = numpy.random.random(size=(10, 5)).astype('float32') - exe.run(program=test_program, - feed={'x': data}, - fetch_list=[hidden.name]) - ema.restore(exe) - """ - - def __init__(self, decay=0.999, thres_steps=None, name=None): - if framework.in_dygraph_mode(): - raise Exception( - "In dygraph, don't support ExponentialMovingAverage.") - self._decay = decay - self._thres_steps = thres_steps - self._name = name if name is not None else '' - self._decay_var = self._get_ema_decay() - - self._step_counter_name = "@EMA_STEP_COUNTER@" - self._params_tmps = [] - for param in default_main_program().global_block().all_parameters(): - if param.do_model_average != False: - tmp = param.block.create_var( - name=unique_name.generate(".".join( - [self._name + param.name, 'ema_tmp'])), - dtype=param.dtype, - persistable=False, - stop_gradient=True) - self._params_tmps.append((param, tmp)) - - self._ema_vars = {} - for param, tmp in self._params_tmps: - with param.block.program._optimized_guard( - [param, tmp]), name_scope('moving_average'): - self._ema_vars[param.name] = self._create_ema_vars(param) - - self.apply_program = Program() - block = self.apply_program.global_block() - with program_guard(main_program=self.apply_program): - decay_pow, global_step = self._get_decay_pow(block) - for param, tmp in self._params_tmps: - param = block._clone_variable(param) - tmp = block._clone_variable(tmp) - ema = block._clone_variable(self._ema_vars[param.name]) - layers.assign(input=param, output=tmp) - # bias correction - with layers.control_flow.Switch() as switch: - with switch.case(global_step > 0): - layers.assign(output=ema, input=ema / (1.0 - decay_pow)) - layers.assign(input=ema, output=param) - - self.restore_program = Program() - block = self.restore_program.global_block() - with program_guard(main_program=self.restore_program): - for param, tmp in self._params_tmps: - tmp = block._clone_variable(tmp) - param = block._clone_variable(param) - layers.assign(input=tmp, output=param) - - def _get_ema_decay(self): - with default_main_program()._lr_schedule_guard(): - decay_var = layers.tensor.create_global_var( - shape=[1], - value=self._decay, - dtype='float32', - persistable=True, - name="scheduled_ema_decay_rate") - - if self._thres_steps is not None: - decay_t = (self._thres_steps + 1.0) / (self._thres_steps + 10.0) - with layers.control_flow.Switch() as switch: - with switch.case(decay_t < self._decay): - layers.tensor.assign(decay_t, 
decay_var) - with switch.default(): - layers.tensor.assign( - np.array( - [self._decay], dtype=np.float32), - decay_var) - return decay_var - - def _get_decay_pow(self, block): - global_step = layers.create_global_var( - name=self._step_counter_name, - shape=[1], - value=0, - dtype='int64', - persistable=True) - global_step = layers.cast(global_step, "float32") - decay_var = block._clone_variable(self._decay_var) - decay_pow_acc = layers.elementwise_pow(decay_var, global_step) - return decay_pow_acc, global_step - - def _create_ema_vars(self, param): - param_ema = layers.create_global_var( - name=unique_name.generate(self._name + param.name + '_ema'), - shape=param.shape, - value=0.0, - dtype=param.dtype, - persistable=True) - - return param_ema - - def update(self): - """ - Update Exponential Moving Average. Should only call this method in - train program. - """ - global_step = layers.autoincreased_step_counter( - counter_name=self._step_counter_name) - param_master_emas = [] - for param, tmp in self._params_tmps: - with param.block.program._optimized_guard( - [param, tmp]), name_scope('moving_average'): - param_ema = self._ema_vars[param.name] - if param.name + '.master' in self._ema_vars: - master_ema = self._ema_vars[param.name + '.master'] - param_master_emas.append([param_ema, master_ema]) - else: - ema_t = param_ema * self._decay_var + param * ( - 1 - self._decay_var) - layers.assign(input=ema_t, output=param_ema) - - # for fp16 params - for param_ema, master_ema in param_master_emas: - default_main_program().global_block().append_op( - type="cast", - inputs={"X": master_ema}, - outputs={"Out": param_ema}, - attrs={ - "in_dtype": master_ema.dtype, - "out_dtype": param_ema.dtype - }) - - @signature_safe_contextmanager - def apply(self, executor, need_restore=True): - """ - Apply moving average to parameters for evaluation. - - Args: - executor (Executor): The Executor to execute applying. - need_restore (bool, optional): Whether to restore parameters after - applying. Default True. - """ - executor.run(self.apply_program) - try: - yield - finally: - if need_restore: - self.restore(executor) - - def restore(self, executor): - """Restore parameters. - - Args: - executor (Executor): The Executor to execute restoring. - """ - executor.run(self.restore_program) - - -class PipelineOptimizer(object): - """ - :api_attr: Static Graph - - Pipeline Optimizer: Make a program to run as pipeline, that is splitting a - program into multiple sections (sub-programs) and each section run on a - device to enable the training of large scale models and the use of - heterogeneous devices. Meanwhile, all sections run in the stype of pipeline. - - Args: - optimizer (Optimizer): The optimizer to use, such as SGD. - num_microbatches (int): Number of microbatches. [Optional. Default:1]. - start_cpu_core_id (int): The first cpu core id to use. [Optional. Default:0]. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - import paddle.fluid.layers as layers - - with fluid.device_guard("gpu:0"): - x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0) - y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0) - data_loader = fluid.io.DataLoader.from_generator( - feed_list=[x, y], - capacity=64, - use_double_buffer=True, - iterable=False) - - emb_x = layers.embedding(input=x, param_attr=fluid.ParamAttr(name="embx"), size=[10,2], is_sparse=False) - emb_y = layers.embedding(input=y, param_attr=fluid.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False) - - with fluid.device_guard("gpu:1"): - concat = layers.concat([emb_x, emb_y], axis=1) - fc = layers.fc(input=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False) - loss = layers.reduce_mean(fc) - optimizer = paddle.optimizer.SGD(learning_rate=0.5) - optimizer = paddle.optimizer.PipelineOptimizer(optimizer) - optimizer.minimize(loss) - - def train_reader(): - for _ in range(4): - x = np.random.random(size=[1]).astype('int64') - y = np.random.random(size=[1]).astype('int64') - yield x, y - data_loader.set_sample_generator(train_reader, batch_size=1) - - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - batch_size = 1 - filelist = [] # you should set your own filelist, e.g. filelist = ["dataA.txt"] - dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset") - dataset.set_use_var([x,y]) - dataset.set_batch_size(batch_size) - dataset.set_filelist(filelist) - data_loader.start() - exe.train_from_dataset( - fluid.default_main_program(), - dataset) - data_loader.reset() - """ - - def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): - if framework.in_dygraph_mode(): - raise Exception("In dygraph, don't support PipelineOptimizer.") - if not isinstance(optimizer, Optimizer): - raise ValueError("The 'optimizer' parameter for " - "PipelineOptimizer must be an instance of " - "Optimizer, but the given type is {}.".format( - type(optimizer))) - self._optimizer = optimizer - assert num_microbatches >= 1, ( - "num_microbatches must be a positive value.") - self._num_microbatches = num_microbatches - assert start_cpu_core_id >= 0, ( - "start_cpu_core_id must be greater than or equal to 0.") - self._start_cpu_core_id = start_cpu_core_id - self._place_list = None - op_maker = core.op_proto_and_checker_maker - self._op_role = op_maker.OpRole - self._op_role_key = op_maker.kOpRoleAttrName() - self._op_role_var_key = op_maker.kOpRoleVarAttrName() - self._op_device_key = op_maker.kOpDeviceAttrName() - self._param_device_map = dict() - - def _create_vars(self, block, main_program): - # Create vars for block, copied from main_program's global block - used_var_set = set() - for op_idx in range(block.desc.op_size()): - op_desc = block.desc.op(op_idx) - vars = op_desc.input_arg_names() + op_desc.output_arg_names() - for var in vars: - # a var whose name contains "blocking_queue" - # only exists in startup program - if var in used_var_set or "_blocking_queue" in var: - continue - used_var_set.add(var) - source_var = main_program.block(0).var(str(var)) - if source_var.type == core.VarDesc.VarType.READER: - block.create_var(name=var, type=core.VarDesc.VarType.READER) - else: - block._clone_variable(source_var, False) - - def _is_loss_grad_op(self, op): - if self._op_role_key not in op.attr_names: - return False - op_role = int(op.all_attrs()[self._op_role_key]) - return op_role & 
int(self._op_role.Backward) and op_role & int( - self._op_role.Loss) - - def _is_backward_op(self, op): - return self._op_role_key in op.attr_names and int(op.all_attrs()[ - self._op_role_key]) & int(self._op_role.Backward) - - def _is_optimize_op(self, op): - return self._op_role_key in op.attr_names and int(op.all_attrs()[ - self._op_role_key]) & int(self._op_role.Optimize) - - def _is_update_op(self, op): - return 'Param' in op.input_names and 'Grad' in op.input_names and ( - "LearningRate" in op.input_names) - - def _split_program(self, main_program): - """ - Split a program into sections according to devices that ops run on. - - Args: - main_program (Program): the main program - """ - programs = [] - # Map from device to its corresponding section program info - device_program_map = dict() - block = main_program.block(0) - - for op in block.ops: - device = op.attr(self._op_device_key) - - if device not in device_program_map: - program = {"program": Program()} - device_program_map[device] = program - program = device_program_map[device] - op_desc = op.desc - ap_op = program["program"].block(0).desc.append_op() - ap_op.copy_from(op_desc) - - for key in sorted(device_program_map.keys()): - program = device_program_map[key] - program['program']._sync_with_cpp() - programs.append(program) - - return programs - - def _find_post_op(self, ops, cur_op, var_name): - """ - Find the real post op that has variable named var_name as input. - - Args: - ops (list): A list of ops. - cur_op (Operator): Current operator which has variable named - var_name as output. - var_name (string): Variable name. - """ - post_op = [] - before = True - for op in ops: - if op == cur_op: - before = False - continue - if before: - continue - for in_var_name in op.input_arg_names: - if in_var_name == var_name: - post_op.append(op) - if post_op: - if not len(post_op) == 1: - raise ValueError("Each op can only have one post op.") - return post_op[0] - return None - - def _find_real_prev_op(self, ops, cur_op, var_name): - """ - Find the real previous op that outputs variable named var_name. - - Args: - ops (list): A list of ops. - cur_op (Operator): Current operator which has variable named - var_name as input. - var_name (string): Variable name. - """ - prev_op = [] - for op in ops: - if op == cur_op: - break - for out_var_name in op.output_arg_names: - if out_var_name == var_name: - prev_op.append(op) - if prev_op: - # A op may have more than one prev op, - # e.g., for 'learning_rate', there may be multiple ops have it as - # output. - return prev_op[-1] - return None - - def _rename_arg(self, op, old_name, new_name): - op_desc = op.desc - if isinstance(op_desc, tuple): - op_desc = op_desc[0] - op_desc._rename_input(old_name, new_name) - op_desc._rename_output(old_name, new_name) - - def _create_var(self, block, ref_var, name): - """ - Create a new var for block, which has the same type, - shape and dtype as ref_var, then rename it with the - name `name`. - """ - new_var = block.create_var( - name=name, - shape=ref_var.shape, - dtype=ref_var.dtype, - type=ref_var.type, - lod_level=ref_var.lod_level, - persistable=False, - is_data=False, - need_check_feed=ref_var.desc.need_check_feed()) - return new_var - - def _get_data_var_info(self, block): - """ - Get all vars whose is_data attribute are true and then rename them. - - For PipelineTrainer, all data vars are binded to - minibatch scope, so we have to feed them to the microbatch - to avoid conflicts. The vars feeded to microbatch have to - be renamed. 
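        As a rough illustration (variable names here are hypothetical; the real new
        names come from ``unique_name.generate``), the two maps built below might
        look like:

        .. code-block:: python

            # data var name -> devices that consume it
            data_devices_map = {"x": ["gpu:0"], "y": ["gpu:0", "gpu:1"]}
            # raw data var name -> renamed per-microbatch name
            raw_name_new_name_map = {"x": "x_0", "y": "y_0"}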
- """ - # A map from var name to the renamed name. - raw_name_new_name_map = dict() - # Because we will create vars in block, it is more safe - # to get all var_names before iteration. - var_names = list(block.vars.keys()) - for var_name in var_names: - var = block.var(var_name) - if not var.is_data: - continue - assert var_name not in raw_name_new_name_map, ( - "{} has already been processed.".format(var_name)) - new_name = unique_name.generate(var_name) - raw_name_new_name_map[var_name] = new_name - new_var = self._create_var(block, var, new_name) - new_var.is_data = False - - # map of data to devices that that data on - data_devices_map = dict() - for op in block.ops: - dev_spec = op.attr(self._op_device_key) - for var_name in op.input_arg_names: - if var_name not in raw_name_new_name_map: - continue - if not var_name in data_devices_map: - data_devices_map[var_name] = [] - if not dev_spec in data_devices_map[var_name]: - data_devices_map[var_name].append(dev_spec) - new_name = raw_name_new_name_map[var_name] - #self._rename_arg(op, var_name, new_name) - return data_devices_map, raw_name_new_name_map - - def _rename_var_in_block(self, block, raw_name_new_name_map): - """ - Rename vars whose names in raw_name_new_name_map to the corresponding - new names. - """ - for op in block.ops: - if op.type == "enqueue" or op.type == "dequeue": - continue - for var_name in op.input_arg_names: - if var_name in raw_name_new_name_map: - new_name = raw_name_new_name_map[var_name] - self._rename_arg(op, var_name, new_name) - - def _insert_enq_deq_for_data_var(self, main_block, programs, startup, - devices): - """ - Insert enqueue and dequeue ops for data var - - Args: - main_block (Block): Global block for main program - programs (dict): Dictionary for section params - startup (Program): Startup program - devices (list): List of devices in the format (dev:dev_index) - """ - main_program = main_block.program - data_devices_map, raw_name_new_name_map = self._get_data_var_info( - main_block) - - first_prog = programs[0]['program'] - first_block = first_prog.block(0) - enqueue_index = 0 - if first_block.ops[0].type == "create_py_reader" or ( - first_block.ops[1].type == "create_py_reader"): - for op in first_block.ops: - if op.type == "read": - enqueue_index += 1 - break - enqueue_index += 1 - first_dev_spec = devices[0] - for var_name in data_devices_map.keys(): - for device in data_devices_map[var_name]: - # step1: generate queue for each pair of data var and device - # that that data on - queue_name = var_name + "_blocking_queue" - queue_name = unique_name.generate(queue_name) - queue_var = startup.block(0).create_var( - name=queue_name, - persistable=True, - type=core.VarDesc.VarType.RAW) - startup.block(0).append_op( - type='queue_generator', - attrs={ - 'names': [queue_name], - 'capacity': self._num_microbatches - }) - main_var = main_block.var(var_name) - assert main_var.is_data - if not var_name in first_block.vars: - self._create_var(first_block, main_var, var_name) - first_block._insert_op( - index=enqueue_index, - type='enqueue', - inputs={'X': first_block.var(var_name)}, - attrs={ - 'queue_name': queue_name, - self._op_device_key: first_dev_spec, - self._op_role_key: self._op_role.Forward - }) - # Get the device that that data on - assert device in devices - prog_index = devices.index(device) - prog = programs[prog_index]['program'] - block = prog.block(0) - index = 0 - if device == first_dev_spec: - index = enqueue_index + 1 - new_name = raw_name_new_name_map[var_name] - source_var = 
main_program.block(0).var(var_name) - new_var = self._create_var(block, source_var, new_name) - block._insert_op( - index=index, - type='dequeue', - outputs={'Out': [new_var]}, - attrs={ - self._op_device_key: device, - self._op_role_key: self._op_role.Forward, - 'queue_name': queue_name, - }) - self._rename_var_in_block(block, raw_name_new_name_map) - - def _strip_grad_suffix(self, name): - """ - Strip the grad suffix from the given variable name - """ - pos = name.find(core.grad_var_suffix()) - return name[:pos] if pos != -1 else name - - def _append_grad_suffix(self, name): - """ - Append grad suffix to the given variable name - """ - return name + core.grad_var_suffix() - - def _update_param_device_map(self, params_grads, block): - for param_grad in params_grads: - if not param_grad[0].trainable: continue - param_name = param_grad[0].name - ops = block.ops - for op in ops: - input_arg_names = op.input_arg_names - if param_name in input_arg_names: - self._param_device_map[param_name] = op.attr( - self._op_device_key) - break - - def _add_opdevice_attr_for_regularization_clip(self, block): - """ - Add op_device attribute for regulization and clip ops. - """ - for op in block.ops: - # role for regularization and clip ops is optimize - if int(op.attr(self._op_role_key)) != int(self._op_role.Optimize): - continue - if op.has_attr(self._op_device_key) and ( - op.attr(self._op_device_key) != ""): - continue - assert self._op_role_var_key in op.attr_names - op_role_var = op.all_attrs()[self._op_role_var_key] - assert len(op_role_var) == 2 - param_name = block.vars[op_role_var[0]].name - device = self._param_device_map[param_name] - op._set_attr(self._op_device_key, device) - - def _add_default_opdevice_attr(self, block): - """ - 1. Add default op_device attribute for lr-related ops. - The default value is the one that of the first place. - 2. Add default op_device attribute for sum ops added during - backward. For these ops, we set the op_device attribute - as the one of its post op, i.e, which op has the output of the - sum op as an input. - """ - first_devcie = "" - - # Get the device spec of the first place. - # device_spec: 'cpu' for cpu device and 'gpu:id' for gpu device, - # e.g. 'gpu:0', 'gpu:1', etc. - for op in block.ops: - if op.has_attr(self._op_device_key) and ( - op.attr(self._op_device_key) != ""): - first_device = op.attr(self._op_device_key) - break - assert first_device - - # set op_device attr for lr-related ops - lrsched_role = int(self._op_role.LRSched) - for op in block.ops: - if not op.has_attr(self._op_device_key) or ( - op.attr(self._op_device_key) == ""): - if op.type == "sum": - # For sum ops that compute the sum of @RENAMED@ vars - for name in op.desc.input_arg_names(): - assert '@RENAME@' in name - assert len(op.desc.output_arg_names()) == 1 - out_name = op.desc.output_arg_names()[0] - post_op = self._find_post_op(block.ops, op, out_name) - device = post_op.attr(self._op_device_key) - assert device - op._set_attr(self._op_device_key, device) - continue - - assert op.attr(self._op_role_key) == lrsched_role, ( - "Op whose op_device attr has not been set for pipeline" - " must be of the role LRSched.") - op._set_attr(self._op_device_key, first_device) - - def _check_validation(self, block): - """ - Check whether ops in a block are all validate (i.e., the - op_device attribute has been set). - Then, return all device specifications in order. 
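        For example, for a two-GPU pipeline the returned list might look like the
        following (hypothetical values, using the 'cpu' / 'gpu:id' format described
        above):

        .. code-block:: python

            # device specs collected in the order the ops appear
            device_specs = ["gpu:0", "gpu:1"]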
- """ - device_specs = [] - for op in block.ops: - type = op.type - if not op._has_kernel(type): - assert op.type == "conditional_block" and ( - op.attr(self._op_role_key) == int(self._op_role.LRSched)), ( - "Now, the only supported op without kernel is " - "conditional_block, and its op role must be LRSched.") - assert op.has_attr(self._op_device_key), ( - "op ({}) has no {} attribute.".format(op.type, - self._op_device_key)) - dev_spec = op.attr(self._op_device_key) - assert dev_spec, ("op_device attribute for op " - "{} has not been set.".format(op.type)) - if not dev_spec in device_specs: - device_specs.append(dev_spec) - return device_specs - - def _insert_enq_deq_ops_for_boundaries(self, block, origin_block, - startup_program): - """ - Insert a pair of enqueue and dequeue ops for every two - consecutive ops on different devices. - """ - startup_block = startup_program.global_block() - extra_index = 0 - - # A map from var to device spec where op takes it as input, - # avoiding multiple enqueue and dequeue ops. - var_devspec = dict() - - for index, op in list(enumerate(origin_block.ops)): - cur_device_spec = op.attr(self._op_device_key) - for var_name in op.input_arg_names: - # i.e., lod_tensor_blocking_queue created by DataLoader, - # which only exists in startup program. - if not var_name in origin_block.vars: continue - var = block.var(var_name) - # skip data, because we will process it later - if var.is_data: continue - prev_op = self._find_real_prev_op(origin_block.ops, op, - var_name) - if prev_op is None: - continue - prev_device_spec = prev_op.attr(self._op_device_key) - - if prev_device_spec != cur_device_spec: - if var_name not in var_devspec: - var_devspec[var_name] = [] - if cur_device_spec in var_devspec[var_name]: continue - var_devspec[var_name].append(cur_device_spec) - - queue_name = var_name + "_blocking_queue" - queue_name = unique_name.generate(queue_name) - queue_var = startup_block.create_var( - name=queue_name, - persistable=True, - type=core.VarDesc.VarType.RAW) - startup_block.append_op( - type='queue_generator', - attrs={ - 'names': [queue_name], - 'capacity': self._num_microbatches - }) - op_role = op.all_attrs()[self._op_role_key] - var = block.vars[var_name] - block._insert_op( - index=index + extra_index, - type='enqueue', - inputs={'X': var}, - attrs={ - 'queue_name': queue_name, - self._op_device_key: prev_device_spec, - self._op_role_key: op_role - }) - extra_index += 1 - block._insert_op( - index=index + extra_index, - type='dequeue', - outputs={'Out': [var]}, - attrs={ - self._op_device_key: cur_device_spec, - 'queue_name': queue_name, - self._op_role_key: op_role - }) - extra_index += 1 - - def _add_dequeue_ops_for_optimize(self, block, startup_program): - startup_block = startup_program.global_block() - grad_queue_map = dict() - grad_device_map = dict() - optimize_index = None - grad_names_to_dequeue = [] - - for index, op in reversed(list(enumerate(block.ops))): - device = op.attr(self._op_device_key) - # Optimizer pass - if not self._is_optimize_op(op): - optimize_index = index + 1 - break - if not self._is_update_op(op): continue - assert self._op_role_var_key in op.attr_names - op_role_var = op.all_attrs()[self._op_role_var_key] - assert len(op_role_var) == 2 - grad_name = op_role_var[1] - assert grad_name not in grad_device_map - assert grad_name not in grad_names_to_dequeue - grad_device_map[grad_name] = device - grad_names_to_dequeue.append(grad_name) - - for grad_name in grad_names_to_dequeue: - device = grad_device_map[grad_name] - 
grad_names = [] - grads = [] - queue_name = grad_name + "_blocking_queue" - queue_name = unique_name.generate(queue_name) - grad_queue_map[grad_name] = queue_name - ref_var = block.vars[grad_name] - queue_var = startup_block.create_var( - name=queue_name, - persistable=True, - type=core.VarDesc.VarType.RAW) - startup_block.append_op( - type='queue_generator', - attrs={ - 'names': [queue_name], - 'capacity': self._num_microbatches - }) - orig_var_name = self._strip_grad_suffix(grad_name) - for _ in range(self._num_microbatches): - u_name = unique_name.generate(orig_var_name) - u_grad_name = self._append_grad_suffix(u_name) - grad_var = self._create_var(block, ref_var, u_grad_name) - grad_names.append(u_grad_name) - grads.append(grad_var) - block._insert_op( - index=optimize_index, - type='dequeue', - outputs={'Out': grads}, - attrs={ - self._op_device_key: device, - 'queue_name': queue_name, - self._op_role_key: self._op_role.Optimize - }) - block._insert_op( - index=optimize_index + 1, - type='sum', - inputs={'X': grad_names}, - outputs={'Out': ref_var}, - attrs={ - self._op_device_key: device, - self._op_role_key: self._op_role.Optimize - }) - return grad_queue_map - - def _insert_enq_deq_ops_for_update(self, block, startup_program): - """ - Insert enqueue and dequeue ops for gradients of parameters. - """ - startup_block = startup_program.global_block() - grad_queue_map = self._add_dequeue_ops_for_optimize(block, - startup_program) - - for index, op in reversed(list(enumerate(block.ops))): - offset = index - device = op.attr(self._op_device_key) - - # Backward pass - if self._is_loss_grad_op(op): - loss_grad_var = block.vars[op.output_arg_names[0]] - scale_factor = self._num_microbatches - block._insert_op( - index=index + 1, - type='scale', - inputs={'X': loss_grad_var}, - outputs={'Out': loss_grad_var}, - attrs={ - 'scale': 1.0 / scale_factor, - self._op_device_key: device, - self._op_role_key: self._op_role.Backward - }) - break - if self._is_backward_op(op) and ( - self._op_role_var_key in op.attr_names): - op_role_var = op.all_attrs()[self._op_role_var_key] - - if len(op_role_var) == 0: - continue - assert len(op_role_var) % 2 == 0 - for i in range(0, len(op_role_var), 2): - grad_name = op_role_var[i + 1] - grad_var = block.vars[grad_name] - assert grad_name in grad_queue_map - queue_name = grad_queue_map[grad_name] - block._insert_op( - index=offset + 1, - type='enqueue', - inputs={'X': block.vars[grad_name]}, - attrs={ - 'queue_name': queue_name, - self._op_device_key: device, - self._op_role_key: self._op_role.Backward - }) - offset += 1 - - def _add_sub_blocks(self, main_block, program_list): - main_program = main_block.program - for prog_info in program_list: - prog = prog_info['program'] - for op in prog.block(0).ops: - if not op.has_attr('sub_block'): - continue - origin_sub_block_id = op.attr('sub_block').id - origin_sub_block = main_program.block(origin_sub_block_id) - new_sub_block = prog._create_block(parent_idx=0) - for op in origin_sub_block.ops: - op_desc = op.desc - ap_op = new_sub_block.desc.append_op() - ap_op.copy_from(op_desc) - new_sub_block._sync_with_cpp() - op._set_attr('sub_block:', new_sub_block) - - def _get_device_info(self, block): - for op in block.ops: - if not op._has_kernel(op.type): continue - op_device = op.attr(self._op_device_key) - return op_device - - def _process_persistable_vars_in_multi_sections(self, main_program, - startup_prog, program_list): - """ - Special Case: process persistable vars that exist in - multiple sections, e.g., shared 
weight - """ - # var_info = {var_name: [program1, program2...]}, - # persistable var only - var_info = dict() - for prog_info in program_list: - prog = prog_info['program'] - block = prog.block(0) - for var_name in block.vars: - var = block.var(var_name) - if not var.persistable: continue - if not var_name in var_info: - var_info[var_name] = [] - if not prog in var_info[var_name]: - var_info[var_name].append(prog) - for var_name in list(var_info.keys()): - if len(var_info[var_name]) == 1: - var_info.pop(var_name) - - # write_info = {var_name: program}, where program is the only program - # in which the var named var_name is written. - write_info = dict() - for var_name in var_info.keys(): - for prog in var_info[var_name]: - block = prog.block(0) - for op in block.ops: - if op.type == "dequeue": continue - # We have processed lr related vars - if op.attr(self._op_role_key) == int( - self._op_role.Optimize.LRSched): - continue - if var_name in op.desc.output_arg_names(): - assert var_name not in write_info, ( - "two sections write the same var({}): second " - "op {}.".format(var_name, op)) - write_info[var_name] = prog - break - - for var_name in var_info.keys(): - # Case 1: read only variables, no special process - if not var_name in write_info: continue - - # Case 2: one write multiple reads - write_prog = write_info[var_name] - write_block = write_prog.block(0) - write_device = self._get_device_info(write_block) - all_progs = var_info[var_name] - for prog in all_progs: - if prog == write_prog: continue - - queue_name = var_name + "_blocking_queue" - queue_name = unique_name.generate(queue_name) - queue_var = startup_prog.block(0).create_var( - name=queue_name, - persistable=True, - type=core.VarDesc.VarType.RAW) - startup_prog.block(0).append_op( - type='queue_generator', - attrs={ - 'names': [queue_name], - 'capacity': self._num_microbatches - }) - write_block._insert_op( - index=0, - type='enqueue', - inputs={'X': write_block.var(var_name), }, - attrs={ - 'queue_name': queue_name, - self._op_device_key: write_device, - # A trick to make the role LRSched to avoid copy every - # microbatch - self._op_role_key: self._op_role.LRSched - }) - read_block = prog.block(0) - read_device = self._get_device_info(read_block) - read_block._insert_op( - index=0, - type='dequeue', - outputs={'Out': [read_block.var(var_name)]}, - attrs={ - self._op_device_key: read_device, - # A trick to make the role LRSched to avoid copy every - # microbatch - self._op_role_key: self._op_role.LRSched, - 'queue_name': queue_name, - }) - - def minimize(self, - loss, - startup_program=None, - parameter_list=None, - no_grad_set=None): - main_block = loss.block - if startup_program is None: - startup_program = default_startup_program() - optimize_ops, params_grads = self._optimizer.minimize( - loss, startup_program, parameter_list, no_grad_set) - self._update_param_device_map(params_grads, main_block) - - # Step1: add default op_device attribute for regulization and clip ops - self._add_opdevice_attr_for_regularization_clip(main_block) - - # Step2: add default op_device attribute for ops whose op_device - # attribute have not been set yet. 
- self._add_default_opdevice_attr(main_block) - device_specs = self._check_validation(main_block) - - # Step3: add enqueue and dequeue ops between section boundaries - origin_prog = main_block.program.clone(for_test=False) - origin_main_block = origin_prog.global_block() - self._insert_enq_deq_ops_for_boundaries(main_block, origin_main_block, - startup_program) - - # Step4: add a pair of enqueue and dequeueN for parameter gradients - self._insert_enq_deq_ops_for_update(main_block, startup_program) - - main_program = main_block.program - - place_list = [] - place_id_list = [] - for dev_spec in device_specs: - if dev_spec == "cpu": - place_list.append(core.CPUPlace()) - place_id_list.append(-1) - elif "gpu" in dev_spec and ":" in dev_spec: - dev_index = dev_spec.split(":")[1] - place_list.append(core.CUDAPlace(int(dev_index))) - place_id_list.append(int(dev_index)) - else: - raise ValueError("Unknown device type: %s", dev_spec) - - # Step5: split program into sections and add pairs of - # enqueue and dequeue ops for data var. - if len(place_list) == 0: - program_list = [] - ptmp = { - "program": main_program, - "input_set": set(), - "output_set": set() - } - program_list.append(ptmp) - else: - program_list = self._split_program(main_program) - for p in program_list: - self._create_vars(p["program"].block(0), main_program) - self._insert_enq_deq_for_data_var(main_block, program_list, - startup_program, device_specs) - - # Step6: Special Case: process persistable vars that exist in - # multiple sections - self._process_persistable_vars_in_multi_sections( - main_program, startup_program, program_list) - - # Step7: Add sub blocks for section programs - self._add_sub_blocks(main_block, program_list) - - main_program._pipeline_opt = { - "trainer": "PipelineTrainer", - "device_worker": "Section", - "section_program_list": program_list, - "place_list": place_list, - "place_id_list": place_id_list, - "sync_steps": -1, - "num_microbatches": self._num_microbatches, - "start_cpu_core_id": self._start_cpu_core_id, - } - return optimize_ops, params_grads, program_list - - -class RecomputeOptimizer(Optimizer): - """ - :api_attr: Static Graph - - Recompute Optimizer Wrapper - - Normally, a training step contains three sub-steps: first, run forward - Operators to calculate the loss; second, run backward Operators to - calculate gradient of the parameters; third, apply optimization method - to update the value of the parameters. - - In the forward computation process, all variables that are needed by - backward computation process will be kept in memory, which occupy a great - amount of memory when the network becomes very deep. - - Recompute split the network to k segments. In each segment, It will - recompute the forward Operators, before running backward operators. It is - very helpful for saving memory. - - The Variables that separate a network to segments are called as checkpoints, - and users should set it manually. The usage is very simple: - - Args: - optimizer (Optimizer): The optimizer that is applied to parameters. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - import numpy as np - def gen_data(): - return {"x": np.random.random(size=(32, 32)).astype('float32'), - "y": np.random.randint(2, size=(32, 1)).astype('int64')} - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - print(input_x) - fc_1 = fluid.layers.fc(input=input_x, size=hid_dim) - prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax') - cost = fluid.layers.cross_entropy(input=prediction, label=input_y) - sum_cost = fluid.layers.reduce_mean(cost) - return sum_cost, fc_1, prediction - input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') - input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - - sgd = paddle.optimizer.Adam(lr=0.01) - sgd = paddle.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - sgd.minimize(cost) - - print("Finished optimize") - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - step = 10 - - for i in range(step): - cost_val = exe.run(feed=gen_data(), - program=fluid.default_main_program(), - fetch_list=[cost.name]) - print("step=%d cost=%f" % (i, cost_val[0])) - - """ - - def __init__(self, optimizer): - if framework.in_dygraph_mode(): - raise Exception("In dygraph, don't support RecomputeOptimizer.") - self._optimizer = optimizer - self._checkpoints = None - self._learning_rate = self._optimizer._learning_rate - self._learning_rate_map = self._optimizer._learning_rate_map - - def _set_checkpoints(self, checkpoints): - """ - Args: - checkpoints (list): List of Variable or string - """ - assert isinstance( - checkpoints, list - ), "_checkpoints should be a list of Variable or a list of String" - for ckpt in checkpoints: - assert ( - isinstance(ckpt, six.string_types) or isinstance(ckpt, Variable) - ), "_checkpoints should be a list of Variable or a list of String" - self._checkpoints = checkpoints - - def load(self, stat_dict): - """ - :api_attr: Static Graph - - load function is not supported by Recompute Optimizer for now. - :return: None - - Args: - stat_dict: the dict load by load_persistable method - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - import paddle.compat as cpt - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = fluid.layers.fc(input=input_x, size=hid_dim) - prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax') - cost = fluid.layers.cross_entropy(input=prediction, label=input_y) - sum_cost = fluid.layers.reduce_mean(cost) - return sum_cost, fc_1, prediction - - input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') - input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = paddle.optimizer.Adam(lr=0.01) - sgd = paddle.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - try: - stat_dict = {} - sgd.load(stat_dict) - except NotImplementedError as e: - print(cpt.get_exception_message(e)) - """ - raise NotImplementedError( - "load function is not supported by Recompute Optimizer for now") - - def apply_gradients(self, params_grads): - """ - call apply_gradients function of self._optimizer. - - Args: - params_grads (list): list of (param, grad) pair to do optimization. - - Returns: - list: A list of operators appended to the current program. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - import paddle.fluid.framework as framework - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = fluid.layers.fc(input=input_x, size=hid_dim) - prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax') - cost = fluid.layers.cross_entropy(input=prediction, label=input_y) - sum_cost = fluid.layers.reduce_mean(cost) - return sum_cost, fc_1, prediction - - - input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') - input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = paddle.optimizer.Adam(lr=0.01) - sgd = paddle.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - params_grads = sgd.backward( - cost, - startup_program=None, - parameter_list=None, - no_grad_set=None) - - program = cost.block.program - with framework.program_guard(program, None): - optimize_ops = sgd.apply_gradients(params_grads) - - print("Finished apply gradients") - """ - - return self._optimizer.apply_gradients(params_grads=params_grads) - - def backward(self, - loss, - startup_program=None, - parameter_list=None, - no_grad_set=None, - callbacks=None): - """ - call append_backward with checkpoints. - - Args: - loss (Variable): loss variable to run optimizations. - startup_program (Program): startup_program for initializing parameters - in `parameter_list`. - parameter_list (list): list of Variables or Variable.names to update. - no_grad_set (set|None): set of Variables or Variables.names should be ignored. - callbacks (list|None): list of callables to run when appending backward - operator for one parameter. - checkpoints (list): list of Variables as checkpoints - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = fluid.layers.fc(input=input_x, size=hid_dim) - prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax') - cost = fluid.layers.cross_entropy(input=prediction, label=input_y) - sum_cost = fluid.layers.reduce_mean(cost) - return sum_cost, fc_1, prediction - - - input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') - input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = paddle.optimizer.Adam(lr=0.01) - sgd = paddle.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - params_grads = sgd.backward( - cost, - startup_program=None, - parameter_list=None, - no_grad_set=None) - print("Finished backward") - """ - assert (self._checkpoints is not None - ), "You should call _set_checkpoints first" - - if framework.in_dygraph_mode(): - raise NotImplementedError( - "DyGraph current does not support recompute") - - self._dtype = loss.dtype - program = loss.block.program - with program_guard(program, startup_program): - checkpoint_vars = [] - for ckpt in self._checkpoints: - if isinstance(ckpt, Variable): - checkpoint_vars.append(ckpt) - else: - checkpoint_vars.append(loss.block.var(ckpt)) - - params_grads = append_backward( - loss, parameter_list, no_grad_set, checkpoints=checkpoint_vars) - # Note: since we can't use all_reduce_op now, - # dgc_op should be the last op of one grad. 
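The recompute strategy documented above can be pictured with a small NumPy sketch that is independent of Paddle's implementation: the forward pass keeps only the segment boundaries (the checkpoints), and each segment's intermediate activations are rebuilt from its checkpoint when the backward pass reaches that segment. The layer definition, shapes, and segment size below are illustrative assumptions only.

    import numpy as np

    rng = np.random.default_rng(0)
    weights = [rng.standard_normal((4, 4)) * 0.5 for _ in range(6)]
    segment_size = 2
    segments = [weights[i:i + segment_size] for i in range(0, len(weights), segment_size)]

    def run_segment(x, ws):
        # forward through one segment, returning every activation of that segment
        acts = [x]
        for w in ws:
            acts.append(np.tanh(acts[-1] @ w))
        return acts

    # forward pass: keep only the input of each segment (the checkpoint), drop the rest
    x = rng.standard_normal((1, 4))
    checkpoints = []
    for ws in segments:
        checkpoints.append(x)
        x = run_segment(x, ws)[-1]

    # backward pass for loss = sum(output): recompute one segment at a time
    grad = np.ones_like(x)
    for ws, ckpt in zip(reversed(segments), reversed(checkpoints)):
        acts = run_segment(ckpt, ws)                    # recomputed here, never stored
        for w, a_out in zip(reversed(ws), reversed(acts[1:])):
            grad = (grad * (1.0 - a_out ** 2)) @ w.T    # backprop through tanh(x @ w)
    print("d(sum of output) / d(input):", grad)

Only the checkpoints are held in memory between the two passes, which is the trade of extra forward computation for lower activation memory that the docstring describes.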
- if hasattr(self._optimizer, "_append_dgc_ops"): - self._optimizer._append_dgc_ops(params_grads) - return params_grads - - def apply_optimize(self, loss, startup_program, params_grads): - """ - call the apply_optimize function of self._optimizer - Args: - loss (Variable): loss variable to run optimizations. - startup_program (Program): startup_program for initializing parameters - in `parameter_list`. - params_grads (list): list of (param, grad) pair to do optimization. - Examples: - .. code-block:: python - import paddle.fluid as fluid - import paddle - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = fluid.layers.fc(input=input_x, size=hid_dim) - prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax') - cost = fluid.layers.cross_entropy(input=prediction, label=input_y) - sum_cost = fluid.layers.reduce_mean(cost) - return sum_cost, fc_1, prediction - - input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') - input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = paddle.optimizer.Adam(lr=0.01) - sgd = paddle.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - params_grads = sgd.backward( - cost, - startup_program=None, - parameter_list=None, - no_grad_set=None) - - optimize_ops = sgd.apply_optimize( - cost, startup_program=None, params_grads=params_grads) - - print("Finished apply_optimize") - """ - - return self._optimizer.apply_optimize( - loss, startup_program=startup_program, params_grads=params_grads) - - def minimize(self, - loss, - startup_program=None, - parameter_list=None, - no_grad_set=None): - assert isinstance(loss, Variable), "The loss should be an Variable." - assert (self._checkpoints is not None - ), "You should call _set_checkpoints first" - if framework.in_dygraph_mode(): - raise NotImplementedError( - "DyGraph current does not support recompute") - params_grads = self.backward( - loss, - startup_program=startup_program, - parameter_list=parameter_list, - no_grad_set=no_grad_set) + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + params_grads.append((jparam, grad_var)) optimize_ops = self.apply_optimize( - loss, startup_program=startup_program, params_grads=params_grads) - - return optimize_ops, params_grads - - -class LookaheadOptimizer(object): - """ - :api_attr: Static Graph - - This implements the Lookahead optimizer of the - paper : https://arxiv.org/abs/1907.08610. - - Lookahead keeps two sets of params: the fast_params and - the slow_params. inner_optimizer update fast_params every - training step. Lookahead updates the slow_params and fast_params - every k training steps as follows: - - .. math:: - - slow\_param_t &= slow\_param_{t-1} + \\alpha * (fast\_param_{t-1} - slow\_param_{t-1}) - - fast\_param_t &= slow\_param_t - - Args: - inner_optimizer (Optimizer): The optimizer that update fast params step by step. - alpha (float): The learning rate of Lookahead. - k (int): The slow params is updated every k steps. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - - x = fluid.layers.data(name='x', shape=[2], dtype='float32') - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - y = fluid.layers.fc(input=[x], size=2, act="softmax") - loss = fluid.layers.cross_entropy(input=y, label=label) - loss = fluid.layers.mean(x=loss) - sgd = paddle.optimizer.SGD(learning_rate=0.01) - optimizer = paddle.optimizer.LookaheadOptimizer(sgd, - alpha=0.5, - k=5) - optimizer.minimize(loss) - main_program = fluid.default_main_program() - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - feeder = fluid.DataFeeder(feed_list=[x, label], place=place) - - step = 0 - while(step < 10): - step += 1 - exe.run(fluid.default_main_program(), - feed=feeder.feed(batch_data)) - - """ - - def __init__(self, inner_optimizer, alpha=0.5, k=5): - - if framework.in_dygraph_mode(): - raise Exception("In dygraph, don't support LookaheadOptimizer.") - assert (inner_optimizer is not None), "inner optimizer can not be None" - assert ( - 0.0 <= alpha <= 1.0 - ), "alpha should be larger or equal to 0.0, and less or equal than 1.0" - assert (isinstance(k, int) and k > 0), "k should be a positive integer" - - self.inner_optimizer = inner_optimizer - self.alpha = alpha - self.k = k - self.type = "lookahead" - - def minimize(self, loss, startup_program=None): - - # Apply inner optimizer to the main_program - mini_out = self.inner_optimizer.minimize( - loss, startup_program=startup_program) - - # Get startup_program and main_program - if startup_program is None: - startup_program = default_startup_program() - main_block = loss.block - - # add some vars to the main_program - params = [param.name for param in main_block.all_parameters()] - param_to_slow = {} - for param in params: - fast_var = main_block.var(param) - assert (fast_var is not None) - slow_var = main_block.create_var( - name=param + "@SLOW", - shape=fast_var.shape, - dtype=fast_var.dtype, - persistable=True) - param_to_slow[param] = slow_var - - # add some vars to the startup_program - startup_block = startup_program.global_block() - for param in params: - fast_var = startup_block.var(param) - assert (fast_var is not None) - slow_var = startup_block.create_var( - name=param + "@SLOW", - shape=fast_var.shape, - dtype=fast_var.dtype, - persistable=True) - - startup_block.append_op( - type="assign", - inputs={"X": fast_var}, - outputs={"Out": slow_var}) - - with framework.program_guard(main_block.program, startup_program): - # Add Var k to main prog and startup prog - k = layers.create_global_var( - name="lookahead_k", - shape=[1], - value=int(self.k), - dtype='int32', - persistable=True) - - # Add Var alpha to main prog and startup prog - alpha = layers.create_global_var( - name="lookahead_alpha", - shape=[1], - value=float(self.alpha), - dtype='float32', - persistable=True) - - # Add Var step - step = layers.create_global_var( - name="lookahead_step", - shape=[1], - value=int(0), - dtype='int32', - persistable=True) - layers.increment(x=step, value=1.0, in_place=True) - - # lookahead - zero_var = layers.fill_constant( - shape=[1], dtype='float32', value=0.0) - - one_var = layers.fill_constant( - shape=[1], dtype='float32', value=1.0) - - mod = layers.elementwise_mod(step, k) - with layers.control_flow.Switch() as switch: - with switch.case(mod == zero_var): - for param_name in params: - fast_var = main_block.var(param_name) - slow_var = param_to_slow[param_name] - tmp_var = 
layers.elementwise_add( - layers.elementwise_mul(fast_var, alpha), - layers.elementwise_mul( - slow_var, - layers.elementwise_sub(one_var, alpha))) - layers.assign(input=tmp_var, output=slow_var) - layers.assign(input=tmp_var, output=fast_var) - with switch.default(): - pass - return mini_out - - -class GradientMergeOptimizer(object): - """ - Gradient Merge, also called as Gradient Accumulation, - is a training strategy for larger batches. With this strategy, - the parameter will not be updated until specific steps. - - For each step, the forward network and the backward network - will run to calculate the gradient of the parameters. - - For every k step, the optimization network will run, - applying a specific optimization method (such as SGD, Adam) - to the parameters. - - Args: - inner_optimizer (Optimizer): The specific optimization (such as SGD, Adam) - which update the parameters - k_steps (int): the update period of the parameters - avg (bool): whether to average the gradients of each mini-batch, - the default value is `True` - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - import numpy as np - - def gen_data(batch_size): - return {"x": np.random.random(size=(batch_size, 32)).astype('float32'), - "y": np.random.random(size=(batch_size, 1)).astype('int64')} - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = fluid.layers.fc(input=input_x, size=hid_dim) - prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax') - cost = fluid.layers.cross_entropy(input=prediction, label=input_y) - sum_cost = fluid.layers.reduce_mean(cost) - return sum_cost, fc_1, prediction - - input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') - input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - sgd = paddle.optimizer.Adam(lr=0.01) - sgd = paddle.optimizer.GradientMergeOptimizer(sgd, k_steps=4, avg=True) - sgd.minimize(cost) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - for i in range(10): - cost_val = exe.run(feed=gen_data(32), - program=fluid.default_main_program(), - fetch_list=[cost.name]) - print("step=%d, cost=%f" % (i, cost_val[0])) - """ - - def __init__(self, inner_optimizer, k_steps=1, avg=True): - if framework.in_dygraph_mode(): - raise Exception( - "In dygraph, we don't support GradientMergeOptimizer." - "You can do Gradient merge by yourself with k-times forward + backward, " - "and one-time optimizer.minimize()") - - assert (inner_optimizer is not None), "inner optimizer can not be None" - assert (isinstance(k_steps, int) and - k_steps > 0), "k_steps should be a positive integer" - - self.inner_optimizer = inner_optimizer - self.k_steps = k_steps - self.type = "gradient_merge" - self.avg = avg - - def _set_k_steps(self, k_steps): - self.k_steps = k_steps - - def _set_avg(self, avg): - self.avg = avg - - def minimize(self, - loss, - startup_program=None, - parameter_list=None, - no_grad_set=None): - - assert isinstance(loss, Variable), "The loss should be an Variable." 
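As a companion to the Gradient Merge docstring above, here is a small NumPy sketch (not Paddle code; the linear model, data, and hyper-parameters are made up for illustration) of the accumulation idea: gradients from k micro-batches are summed into a persistent buffer, the parameter is updated only on every k-th step with the averaged buffer, and the buffer is then cleared.

    import numpy as np

    rng = np.random.default_rng(0)
    true_w = np.array([1.0, -2.0, 0.5])
    w = np.zeros(3)                         # parameter of a linear model y = x @ w
    lr, k = 0.5, 4                          # k plays the role of k_steps above

    def grad_mse(w, x, y):
        # gradient of 0.5 * mean((x @ w - y) ** 2) with respect to w
        return x.T @ (x @ w - y) / len(y)

    grad_buffer = np.zeros_like(w)          # accumulator, like the @GRAD@GradientMerge vars
    for step in range(1, 10 * k + 1):
        x = rng.standard_normal((8, 3))     # one micro-batch
        y = x @ true_w + 0.01 * rng.standard_normal(8)
        grad_buffer += grad_mse(w, x, y)    # accumulate only, no parameter update yet
        if step % k == 0:
            w -= lr * grad_buffer / k       # one update with the averaged gradient (avg=True)
            grad_buffer[:] = 0.0            # clear the buffer for the next window
    print("learned w:", w)                  # close to true_w

The effective batch size becomes k times the micro-batch size while peak memory stays at the micro-batch level, which is the point of the strategy.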
- assert ( - parameter_list is None - ), "The parameter_list should be None when using GradientMergeOptimizer" - assert ( - no_grad_set is None - ), "The no_grad_set should be None when using GradientMergeOptimizer" - - params_grads = self.inner_optimizer.backward( - loss, startup_program=startup_program) - - #TODO(mapingshuo) support sparse embedding - for k, v in params_grads: - assert ( - v.type != core.VarDesc.VarType.SELECTED_ROWS - ), "SELECTED_ROWS is not supported in GradientMergeOptimizer for now" - - param_to_grad = {k.name: v for (k, v) in params_grads} - - # Get startup_program and main_program - if startup_program is None: - startup_program = default_startup_program() - main_block = loss.block - - # add some vars to the main_program and startup_program - startup_block = startup_program.global_block() - param_names = param_to_grad.keys() - param_to_gradient_merge = {} - - for param_name in param_names: - param_var = main_block.var(param_name) - assert (param_var is not None) - gradient_merge_var = main_block.create_var( - name=param_name + "@GRAD@GradientMerge", - shape=param_var.shape, - dtype=param_var.dtype, - persistable=True) - param_to_gradient_merge[param_name] = gradient_merge_var - startup_gradient_merge_var = startup_block.create_var( - name=param_name + "@GRAD@GradientMerge", - shape=param_var.shape, - dtype=param_var.dtype, - persistable=True) - startup_block.append_op( - type="fill_constant", - outputs={"Out": startup_gradient_merge_var}, - attrs={ - "shape": param_var.shape, - "dtype": param_var.dtype, - "value": float(0), - }) - - with framework.program_guard(main_block.program, startup_program): - # Add Var k to main prog and startup prog - gradient_merge_k = layers.create_global_var( - name="gradient_merge_k", - shape=[1], - value=int(self.k_steps), - dtype='int32', - persistable=True) - - # Add Var step - gradient_merge_step = layers.create_global_var( - name="gradient_merge_step", - shape=[1], - value=int(0), - dtype='int32', - persistable=True) - layers.increment(x=gradient_merge_step, value=1.0, in_place=True) - - # gradient merge - zero_var = layers.fill_constant( - shape=[1], dtype='float32', value=0.0) - one_var = layers.fill_constant( - shape=[1], dtype='float32', value=1.0) - - mod = layers.elementwise_mod(gradient_merge_step, gradient_merge_k) - with layers.control_flow.Switch() as switch: - with switch.case(mod != zero_var): - # 1. update the gradient_merge_vars - # gradient_merge_vars += gradient_vars - cur_block = main_block.program.current_block() - for param_name in param_names: - grad = param_to_grad[param_name] - grad_merge = param_to_gradient_merge[param_name] - cur_block.append_op( - type="elementwise_add", - inputs={'X': grad, - 'Y': grad_merge}, - outputs={'Out': grad_merge}, - attrs={'axis': -1, - 'use_mkldnn': False}) - - with switch.default(): - # 1. 
update the graient_vars - # gradient_vars += gradient_merge_vars - cur_block_idx = main_block.program.current_block_idx - cur_block = main_block.program.current_block() - for param_name in param_names: - grad = param_to_grad[param_name] - grad_merge = param_to_gradient_merge[param_name] - if self.avg: - tmp_var = layers.elementwise_add(grad, grad_merge) - cur_block.append_op( - type='scale', - inputs={'X': tmp_var}, - outputs={'Out': grad}, - attrs={ - 'scale': 1.0 / self.k_steps, - 'bias': 0.0, - 'bias_after_scale': False - }) - else: - cur_block.append_op( - type="elementwise_add", - inputs={'X': grad, - 'Y': grad_merge}, - outputs={'Out': grad}, - attrs={'axis': -1, - 'use_mkldnn': False}) - - # 2. apply_optimize - target_grad_block = main_block.program._create_block( - parent_idx=cur_block.parent_idx) - target_grad_block._set_forward_block_idx(cur_block_idx) - main_block.program.current_block_idx = cur_block_idx - - optimize_ops = self.inner_optimizer.apply_optimize( - loss, - startup_program=startup_program, - params_grads=params_grads) - - # 3. clear gradient_merge_vars - for param_name in param_names: - grad_merge = param_to_gradient_merge[param_name] - layers.fill_constant( - shape=grad_merge.shape, - dtype=grad_merge.dtype, - value=0.0, - out=grad_merge) - return optimize_ops, params_grads + loss=None, startup_program=None, params_grads=params_grads) From 5a558692bbba8ef82160ae4f9e115af23aa8c5e2 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Tue, 18 Aug 2020 18:52:40 +0800 Subject: [PATCH 06/30] add adam --- .../fluid/tests/unittests/test_adam_op.py | 28 +- python/paddle/optimizer/__init__.py | 3 +- python/paddle/optimizer/adam.py | 286 ++++++++++++++++++ python/paddle/optimizer/optimizer.py | 25 +- 4 files changed, 331 insertions(+), 11 deletions(-) create mode 100644 python/paddle/optimizer/adam.py diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 4c1ff217adcc6..365ad26344ba8 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -446,7 +446,9 @@ def test_with_place(place, shape): class TestAdamOpV2(unittest.TestCase): def test_adam_op(self): - exe = fluid.Executor(place, shape) + place = fluid.CPUPlace() + shape = [2, 3, 8, 8] + exe = fluid.Executor(place) train_prog = fluid.Program() startup = fluid.Program() with fluid.program_guard(train_prog, startup): @@ -461,7 +463,11 @@ def test_adam_op(self): shape=[1], value=0.95, dtype='float32', persistable=True) betas = [beta1, beta2] opt = paddle.optimizer.Adam( - lr=1e-5, betas=betas, weight_decay=0.01, eps=1e-8) + learning_rate=1e-5, + beta1=beta1, + beta2=beta2, + weight_decay=0.01, + epsilon=1e-8) opt.minimize(loss) exe.run(startup) @@ -469,12 +475,18 @@ def test_adam_op(self): rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss]) assert rets[0] is not None - shape = [2, 3, 8, 8] - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - for place in places: - test_with_place(place, shape) + def test_adam_op_dygraph(self): + with fluid.dygraph.guard(): + value = np.arange(26).reshape(2, 13).astype("float32") + a = fluid.dygraph.to_variable(value) + linear = fluid.Linear(13, 5, dtype="float32") + + adam = paddle.optimizer.Adam( + learning_rate=0.01, parameters=linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() if __name__ == "__main__": diff --git a/python/paddle/optimizer/__init__.py 
b/python/paddle/optimizer/__init__.py index 15a3e8f7aa9bb..037ae2c5520a6 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -20,7 +20,7 @@ 'LambOptimizer', 'LarsMomentum', 'LarsMomentumOptimizer', 'LookaheadOptimizer', 'ModelAverage', 'Momentum', 'MomentumOptimizer', 'PipelineOptimizer', 'RecomputeOptimizer', 'RMSPropOptimizer', 'SGD', - 'SGDOptimizer' + 'SGDOptimizer', 'Optimizer' ] @@ -33,3 +33,4 @@ RecomputeOptimizer from .optimizer import Optimizer +from .adam import Adam diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py new file mode 100644 index 0000000000000..d68b7af9d21a0 --- /dev/null +++ b/python/paddle/optimizer/adam.py @@ -0,0 +1,286 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .optimizer import Optimizer +from ..fluid import core +from ..fluid import framework +from ..fluid.framework import Variable + + +class Adam(Optimizer): + """ + The Adam optimizer uses an optimization described at the end + of section 2 of `Adam paper `_ , + it can dynamically adjusts the learning rate of each parameter using + the 1st moment estimates and the 2nd moment estimates of the gradient. + + The parameter ``param_out`` update rule with gradient ``grad``: + + .. math:: + + t & = t + 1 + + moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad + + moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad + + learning\_rate & = learning\_rate * \\ + \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t} + + param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + + Related paper: `Adam: A Method for Stochastic Optimization `_ + + Args: + learning_rate (float|Tensor, optional): The learning rate used to update ``Parameter``. + It can be a float value or a ``Variable`` with a float type. The default value is 0.001. + beta1 (float|Variable, optional): The exponential decay rate for the 1st moment estimates. + It should be a float number or a Variable with shape [1] and data type as float32. + The default value is 0.9. + beta2 (float|Variable, optional): The exponential decay rate for the 2nd moment estimates. + It should be a float number or a Variable with shape [1] and data type as float32. + The default value is 0.999. + epsilon (float, optional): A small float value for numerical stability. + The default value is 1e-08. + parameters (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ + It canbe a float value as coeff of L2 regularization or \ + :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. 
+ If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ + the regularization setting here in optimizer will be ignored for this parameter. \ + Otherwis, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators. + The accumulators are updated at every step. Every element of the two moving-average + is updated in both dense mode and sparse mode. If the size of parameter is very large, + then the update may be very slow. The lazy mode only update the element that has + gradient in current mini-batch, so it will be much more faster. But this mode has + different semantics with the original Adam algorithm and may lead to different result. + The default value is False. + + Examples: + .. code-block:: python + + import paddle + import paddle.fluid as fluid + + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.data(name='x', shape=[None, 13], dtype='float32') + y = fluid.data(name='y', shape=[None, 1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + adam_optimizer = paddle.optimizer.Adam(0.01) + adam_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + .. 
code-block:: python + + # Adam with beta1/beta2 as Variable + import paddle + import paddle.fluid as fluid + import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler + + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.data(name='x', shape=[None, 13], dtype='float32') + y = fluid.data(name='y', shape=[None, 1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + # define beta decay variable + def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate): + global_step = lr_scheduler._decay_step_counter() + + beta1 = fluid.layers.create_global_var( + shape=[1], + value=float(beta1_init), + dtype='float32', + # set persistable for save checkpoints and resume + persistable=True, + name="beta1") + beta2 = fluid.layers.create_global_var( + shape=[1], + value=float(beta2_init), + dtype='float32', + # set persistable for save checkpoints and resume + persistable=True, + name="beta2") + + div_res = global_step / decay_steps + decayed_beta1 = beta1_init * (decay_rate**div_res) + decayed_beta2 = beta2_init * (decay_rate**div_res) + fluid.layers.assign(decayed_beta1, beta1) + fluid.layers.assign(decayed_beta2, beta2) + + return beta1, beta2 + + beta1, beta2 = get_decayed_betas(0.9, 0.99, 1e5, 0.9) + adam_optimizer = paddle.optimizer.Adam( + learning_rate=0.01, + beta1=beta1, + beta2=beta2) + adam_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + """ + _moment1_acc_str = "moment1" + _moment2_acc_str = "moment2" + _beta1_pow_acc_str = "beta1_pow_acc" + _beta2_pow_acc_str = "beta2_pow_acc" + + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None, + lazy_mode=False): + assert learning_rate is not None + assert beta1 is not None + assert beta2 is not None + assert epsilon is not None + super(Adam, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name) + self.type = "adam" + self._beta1 = beta1 + self._beta2 = beta2 + self._epsilon = epsilon + self._lazy_mode = lazy_mode + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + + # Create accumulator tensors for first and second moments + for p in parameters: + self._add_accumulator(self._moment1_acc_str, p) + self._add_accumulator(self._moment2_acc_str, p) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + fill_value=0.9 if isinstance(self._beta1, Variable) \ + else self._beta1, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + self._add_accumulator( + name=self._beta2_pow_acc_str, + param=p, + fill_value=0.999 if isinstance(self._beta2, Variable) \ + else self._beta2, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + moment1 = self._get_accumulator(self._moment1_acc_str, + param_and_grad[0]) + moment2 = self._get_accumulator(self._moment2_acc_str, + param_and_grad[0]) + 
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param_and_grad[0]) + lr = self._create_param_lr(param_and_grad) + # create the adam optimize op + + if framework.in_dygraph_mode(): + _beta1 = self._beta1 if not isinstance( + self._beta1, Variable) else self._beta1.numpy().item(0) + _beta2 = self._beta2 if not isinstance( + self._beta2, Variable) else self._beta2.numpy().item(0) + _, _, _, _, _ = core.ops.adam( + param_and_grad[0], param_and_grad[1], lr, moment1, moment2, + beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1, + moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon, + 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', + 1000, 'beta1', _beta1, 'beta2', _beta2) + + return None + + inputs = { + "Param": [param_and_grad[0]], + "Grad": [param_and_grad[1]], + "LearningRate": [lr], + "Moment1": [moment1], + "Moment2": [moment2], + "Beta1Pow": [beta1_pow_acc], + "Beta2Pow": [beta2_pow_acc] + } + outputs = { + "ParamOut": [param_and_grad[0]], + "Moment1Out": [moment1], + "Moment2Out": [moment2], + "Beta1PowOut": [beta1_pow_acc], + "Beta2PowOut": [beta2_pow_acc], + } + attrs = { + "epsilon": self._epsilon, + "lazy_mode": self._lazy_mode, + "min_row_size_to_use_multithread": 1000 + } + + if isinstance(self._beta1, Variable): + inputs['Beta1Tensor'] = self._beta1 + else: + attrs['beta1'] = self._beta1 + if isinstance(self._beta2, Variable): + inputs['Beta2Tensor'] = self._beta2 + else: + attrs['beta2'] = self._beta2 + + adam_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True) + + return adam_op diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 975f9e1ad1bf6..dadeaf28ce00b 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -37,7 +37,7 @@ from ..fluid.dygraph.learning_rate_scheduler import LearningRateDecay, _LearningRateEpochDecay from paddle.fluid import core from paddle.fluid.layers import tensor -from ..fluid.functools import reduce +from functools import reduce from ..fluid.wrapped_decorator import signature_safe_contextmanager from .. import compat as cpt @@ -58,6 +58,27 @@ class Optimizer(object): Define the common interface of an optimizer. User should not use this class directly, but need to use one of it's implementation. + + Args: + learning_rate (float|Variable): The learning rate used to update ``Parameter``. + It can be a float value or a ``Tensor`` with a float type. + parameters (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ + It canbe a float value as coeff of L2 regularization or \ + :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. + If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ + the regularization setting here in optimizer will be ignored for this parameter. \ + Otherwis, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of \ + some derived class of ``GradientClipBase`` . 
There are three cliping strategies \ + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , \ + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. """ @imperative_base.no_grad() @@ -949,7 +970,7 @@ def step(self): continue if param._grad_ivar() is not None: grad_var = param._grad_ivar() - params_grads.append((jparam, grad_var)) + params_grads.append((param, grad_var)) optimize_ops = self.apply_optimize( loss=None, startup_program=None, params_grads=params_grads) From fd34fbd52b14cb571707b219551b8f35d1220fcd Mon Sep 17 00:00:00 2001 From: MRXLT Date: Tue, 18 Aug 2020 19:56:45 +0800 Subject: [PATCH 07/30] fix doc --- python/paddle/optimizer/adam.py | 12 ++++++------ python/paddle/optimizer/optimizer.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index d68b7af9d21a0..91800b94840a6 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -44,12 +44,12 @@ class Adam(Optimizer): Args: learning_rate (float|Tensor, optional): The learning rate used to update ``Parameter``. - It can be a float value or a ``Variable`` with a float type. The default value is 0.001. - beta1 (float|Variable, optional): The exponential decay rate for the 1st moment estimates. - It should be a float number or a Variable with shape [1] and data type as float32. + It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. + beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates. + It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.9. - beta2 (float|Variable, optional): The exponential decay rate for the 2nd moment estimates. - It should be a float number or a Variable with shape [1] and data type as float32. + beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates. + It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.999. epsilon (float, optional): A small float value for numerical stability. The default value is 1e-08. @@ -107,7 +107,7 @@ class Adam(Optimizer): .. code-block:: python - # Adam with beta1/beta2 as Variable + # Adam with beta1/beta2 as Tensor import paddle import paddle.fluid as fluid import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index dadeaf28ce00b..521dce76ba557 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -60,7 +60,7 @@ class Optimizer(object): but need to use one of it's implementation. Args: - learning_rate (float|Variable): The learning rate used to update ``Parameter``. + learning_rate (float|Tensor): The learning rate used to update ``Parameter``. It can be a float value or a ``Tensor`` with a float type. parameters (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. 
\ From e67cd863829d22975530b14611590def3019c2e0 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Wed, 19 Aug 2020 14:59:08 +0800 Subject: [PATCH 08/30] fix doc && add adamw; notest --- .../fluid/tests/unittests/test_adamw_op.py | 68 ++++++ python/paddle/optimizer/__init__.py | 20 +- python/paddle/optimizer/adam.py | 2 + python/paddle/optimizer/adamw.py | 215 ++++++++++++++++++ python/paddle/optimizer/optimizer.py | 189 ++++++++------- 5 files changed, 382 insertions(+), 112 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_adamw_op.py create mode 100644 python/paddle/optimizer/adamw.py diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py new file mode 100644 index 0000000000000..d38dc0087d8f6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -0,0 +1,68 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import numpy as np +import paddle.fluid as fluid + + +class TestAdamWOp(unittest.TestCase): + def test_adamw_opi_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_variable(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=0.01) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_adamw_op(self): + place = fluid.CPUPlace() + shape = [2, 3, 8, 8] + exe = fluid.Executor(place) + train_prog = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(train_prog, startup): + with fluid.unique_name.guard(): + data = fluid.data(name="data", shape=shape) + conv = fluid.layers.conv2d(data, 8, 3) + loss = paddle.mean(conv) + + beta1 = fluid.layers.create_global_var( + shape=[1], value=0.85, dtype='float32', persistable=True) + beta2 = fluid.layers.create_global_var( + shape=[1], value=0.95, dtype='float32', persistable=True) + betas = [beta1, beta2] + opt = paddle.optimizer.AdamW( + learning_rate=1e-5, + beta1=beta1, + beta2=beta2, + weight_decay=0.01, + epsilon=1e-8) + opt.minimize(loss) + + exe.run(startup) + data_np = np.random.random(shape).astype('float32') + rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss]) + assert rets[0] is not None + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index 037ae2c5520a6..31a15f2e4482c 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -13,24 +13,20 @@ # limitations under the License. 
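For readers comparing the new AdamW class introduced in this commit with plain Adam plus L2 regularization, the following NumPy sketch (not the Paddle implementation; all numbers are illustrative) shows what the decoupling changes: the decay term is taken off the parameter directly and never flows through Adam's moment estimates.

    import numpy as np

    def adam_step(p, g, m, v, t, lr=1e-3, b1=0.9, b2=0.999, eps=1e-8):
        # one Adam update, following the update rule quoted in the docstrings
        m = b1 * m + (1 - b1) * g
        v = b2 * v + (1 - b2) * g * g
        lr_t = lr * np.sqrt(1 - b2 ** t) / (1 - b1 ** t)
        return p - lr_t * m / (np.sqrt(v) + eps), m, v

    p = np.array([1.0, -2.0])
    g = np.array([0.1, 0.3])
    coeff = 0.01                                     # weight_decay coefficient

    # (a) Adam with L2 folded into the gradient: the decay is rescaled by the moments
    p_l2, _, _ = adam_step(p, g + coeff * p, np.zeros(2), np.zeros(2), t=1)

    # (b) decoupled decay in the spirit of AdamW: shrink the parameter first,
    #     then apply the ordinary Adam update to it
    p_w, _, _ = adam_step(p - coeff * p, g, np.zeros(2), np.zeros(2), t=1)

    print("Adam + L2 :", p_l2)
    print("decoupled :", p_w)                        # the two updates differ

Because Adam normalizes whatever enters the gradient, folding L2 into the gradient largely cancels its effect, while the decoupled form keeps a genuine shrinkage of the weights; that is the failure mode the AdamW docstring refers to.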
__all__ = [ - 'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam', - 'Adamax', 'DecayedAdagrad', 'AdamW' - 'DecayedAdagradOptimizer', 'DGCMomentumOptimizer', 'Dpsgd', - 'DpsgdOptimizer', 'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer', - 'LambOptimizer', 'LarsMomentum', 'LarsMomentumOptimizer', - 'LookaheadOptimizer', 'ModelAverage', 'Momentum', 'MomentumOptimizer', - 'PipelineOptimizer', 'RecomputeOptimizer', 'RMSPropOptimizer', 'SGD', - 'SGDOptimizer', 'Optimizer' + 'Adadelta', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'AdamW' + 'DGCMomentumOptimizer', 'Dpsgd', 'DpsgdOptimizer', + 'ExponentialMovingAverage', 'Ftrl', 'Lamb', 'LarsMomentum', + 'LookaheadOptimizer', 'ModelAverage', 'Momentum', 'PipelineOptimizer', + 'RecomputeOptimizer', 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer' ] from ..fluid.optimizer import SGD, Momentum, Adagrad, Dpsgd, DecayedAdagrad, \ - Ftrl, SGDOptimizer, MomentumOptimizer, AdagradOptimizer, DpsgdOptimizer, \ - DecayedAdagradOptimizer, RMSPropOptimizer, FtrlOptimizer, Adadelta, \ - AdadeltaOptimizer, ModelAverage, LarsMomentum, \ - LarsMomentumOptimizer, DGCMomentumOptimizer, LambOptimizer, \ + Ftrl, Adadelta, Lamb, RMSProp, \ + ModelAverage, LarsMomentum, DGCMomentumOptimizer, \ ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, \ RecomputeOptimizer from .optimizer import Optimizer from .adam import Adam +from .adamw import AdamW diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 91800b94840a6..b0b2af34f2c34 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -17,6 +17,8 @@ from ..fluid import framework from ..fluid.framework import Variable +__all__ = ["Adam"] + class Adam(Optimizer): """ diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py new file mode 100644 index 0000000000000..fecc74ce2c2d7 --- /dev/null +++ b/python/paddle/optimizer/adamw.py @@ -0,0 +1,215 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .optimizer import Optimizer +from .adam import Adam +from ..fluid import framework +import paddle +__all__ = ['AdamW'] + + +class DecoupledWeightDecay(object): + def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs): + if not isinstance(coeff, float) and \ + not isinstance(coeff, framework.Variable): + raise TypeError("coeff should be float or Variable.") + self._params_name = set() + self._apply_decay_param_fun = apply_decay_param_fun + self._coeff = coeff + super(DecoupledWeightDecay, self).__init__(**kwargs) + + def _scale_parameters(self, params_and_grads): + """ + Adds weight decay ops. + scaled_parameter = parameter * coeff + + Args: + params_and_grads: A list of (parameters, gradients) pairs, + the parameters need to decay. + Raises: + Exception: The type of coeff and parameter is not consistent. 
+ """ + if isinstance(self._coeff, float) and self._coeff == 0.0: + return + + scaled_params = [] + for param, grad in params_and_grads: + # If no gradient then we don't need to do anything + if grad is None: + continue + if self._apply_decay_param_fun is not None \ + and not self._apply_decay_param_fun(param.name): + continue + + if isinstance(self._coeff, float): + assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \ + "the type of coeff(float) and parameter(%s) is not consistent."%(self._coeff.dtype) + else: + assert self._coeff.dtype == param.dtype, \ + "the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype) + + with param.block.program._optimized_guard( + [param, grad]), framework.name_scope('weight decay'): + assert param.name not in self._params_name + scaled_params.append((param, grad, param * self._coeff)) + self._params_name.add(param.name) + return scaled_params + + def backward(self, **kargs): + return super(DecoupledWeightDecay, self).backward(**kargs) + + def apply_optimize(self, **kargs): + return super(DecoupledWeightDecay, self).apply_optimize(**kargs) + + def minimize(self, + loss, + startup_program=None, + parameters=None, + no_grad_set=None): + params_grads = self.backward( + loss=loss, + startup_program=startup_program, + parameters=parameters, + no_grad_set=no_grad_set) + scaled_params = self._scale_parameters(params_grads) + for p_grad_sgrad in scaled_params: + param, grad, scaled_param = p_grad_sgrad + with param.block.program._optimized_guard( + [param, grad]), framework.name_scope('weight decay'): + updated_param = paddle.fluid.layers.elementwise_sub( + x=param, y=scaled_param) + paddle.fluid.layers.assign(input=updated_param, output=param) + + optimize_ops = self.apply_optimize( + loss=loss, + params_grads=params_grads, + startup_program=startup_program) + return optimize_ops, params_grads + + @framework.dygraph_only + def step(self): + parameter_list = self._parameter_list + self._dtype = None + params_grads = [] + for param in self._parameter_list: + if not param.trainable: + continue + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + params_grads.append((param, grad_var)) + + scaled_params = self._scale_parameters(params_grads) + for p_grad_sgrad in scaled_params: + param, grad, scaled_param = p_grad_sgrad + with param.block.program._optimized_guard( + [param, grad]), framework.name_scope('weight decay'): + updated_param = paddle.fluid.layers.elementwise_sub( + x=param, y=scaled_param) + paddle.fluid.layers.assign(input=updated_param, output=param) + optimize_ops = self.apply_optimize( + loss=None, startup_program=None, params_grads=params_grads) + + def __str__(self): + return " ".join(["Weight Decay, params:", ",".join(self._params_name)]) + + +class AdamW(DecoupledWeightDecay, Adam): + """ + The AdamW optimizer is implemented based on the AdamW Optimization + in paper `DECOUPLED WEIGHT DECAY REGULARIZATION `_. + it can resolves the problem of L2 regularization failure in the Adam optimizer. + + .. math:: + + t & = t + 1 + + moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad + + moemnt\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad + + learning\_rate & = learning\_rate * \\ + \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {beta}_1^t} + + param\_out & = param - learning\_rate * (\\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) + + + Args: + learning_rate (float|Tensor, optional): The learning rate used to update ``Parameter``. 
+ It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. + beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates. + It should be a float number or a Tensor with shape [1] and data type as float32. + The default value is 0.9. + beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates. + It should be a float number or a Tensor with shape [1] and data type as float32. + The default value is 0.999. + epsilon (float, optional): A small float value for numerical stability. + The default value is 1e-08. + parameters (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float|Tensor): The weight decay coefficient, it can be + float or Tensor. + apply_decay_param_fun (function|None): If it is not None, + only tensors that makes apply_decay_param_fun(Tensor)==True + will be updated. It only works when we want to specify tensors. + Default: None. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators. + The accumulators are updated at every step. Every element of the two moving-average + is updated in both dense mode and sparse mode. If the size of parameter is very large, + then the update may be very slow. The lazy mode only update the element that has + gradient in current mini-batch, so it will be much more faster. But this mode has + different semantics with the original Adam algorithm and may lead to different result. + The default value is False. + **Notes**: + **Currently, AdamW doesn't support sparse parameter optimization.** + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + import numpy + + # First create the Executor. + place = fluid.CPUPlace() # fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + train_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + data = fluid.data(name='X', shape=[None, 1], dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + loss = fluid.layers.mean(hidden) + adam = paddle.optimizer.AdamW(learning_rate=0.2) + adam.minimize(loss) + + # Run the startup program once and only once. 
+ exe.run(startup_program) + + x = numpy.random.random(size=(10, 1)).astype('float32') + outs = exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) + """ + + def __init__(self, weight_decay, apply_decay_param_fun=None, **kwargs): + super(AdamW, self).__init__( + weight_decay, apply_decay_param_fun=apply_decay_param_fun, **kwargs) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 521dce76ba557..1f2900a4aa0b3 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -41,15 +41,7 @@ from ..fluid.wrapped_decorator import signature_safe_contextmanager from .. import compat as cpt -__all__ = [ - 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad', - 'Ftrl', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', - 'AdamOptimizer', 'AdamaxOptimizer', 'DpsgdOptimizer', - 'DecayedAdagradOptimizer', 'RMSPropOptimizer', 'FtrlOptimizer', 'Adadelta', - 'AdadeltaOptimizer', 'ModelAverage', 'LarsMomentum', - 'LarsMomentumOptimizer', 'LambOptimizer', 'ExponentialMovingAverage', - 'PipelineOptimizer', 'LookaheadOptimizer', 'RecomputeOptimizer' -] +__all__ = ['Optimizer'] class Optimizer(object): @@ -62,7 +54,7 @@ class Optimizer(object): Args: learning_rate (float|Tensor): The learning rate used to update ``Parameter``. It can be a float value or a ``Tensor`` with a float type. - parameters (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ + parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ @@ -70,7 +62,7 @@ class Optimizer(object): :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ the regularization setting here in optimizer will be ignored for this parameter. \ - Otherwis, the regularization setting here in optimizer will take effect. \ + Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of \ some derived class of ``GradientClipBase`` . There are three cliping strategies \ @@ -79,6 +71,9 @@ class Optimizer(object): name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. The default value is None. + + Returns: + Base class for optimizer. """ @imperative_base.no_grad() @@ -105,7 +100,7 @@ def __init__(self, for param in self._parameter_list: if param.regularizer is not None: logging.info( - "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. " + "If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. " "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" % weight_decay.__str__()) break @@ -159,13 +154,12 @@ def state_dict(self): Examples: .. 
code-block:: python - import paddle.fluid as fluid import paddle - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + paddle.disable_static(): + emb = paddle.nn.Embedding([10, 10]) - adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) - state_dict = adam.state_dict() + adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) + state_dict = adam.state_dict() ''' state_dict = {} @@ -200,20 +194,20 @@ def set_state_dict(self, state_dict): Examples: .. code-block:: python - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + paddle.disable_static(): + emb = paddle.nn.Embedding([10, 10]) - state_dict = emb.state_dict() - fluid.save_dygraph(state_dict, "paddle_dy") + state_dict = emb.state_dict() + paddle.framework.save(state_dict, "paddle_dy") - adam = paddle.optimizer.Adam(learning_rate=fluid.layers.noam_decay( 100, 10000), - parameters=emb.parameters()) - state_dict = adam.state_dict() - fluid.save_dygraph(state_dict, "paddle_dy") + adam = paddle.optimizer.Adam(learning_rate=paddle.nn.functional.noam_decay( 100, 10000), + parameters=emb.parameters()) + state_dict = adam.state_dict() + paddle.framework.save(state_dict, "paddle_dy") - para_state_dict, opti_state_dict = fluid.load_dygraph( "paddle_dy") + para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy") - adam.set_state_dict(opti_state_dict) + adam.set_state_dict(opti_state_dict) ''' @@ -337,29 +331,28 @@ def set_lr(self, value): Examples: .. code-block:: python - import paddle.fluid as fluid import paddle - with fluid.dygraph.guard(): - linear = fluid.dygraph.nn.Linear(10, 10) + paddle.disable_static() + linear = paddle.nn.Linear(10, 10) - adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) + adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) - # set learning rate manually by python float value - lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] - for i in range(5): - adam.set_lr(lr_list[i]) - lr = adam.current_step_lr() - print("current lr is {}".format(lr)) - # Print: - # current lr is 0.2 - # current lr is 0.3 - # current lr is 0.4 - # current lr is 0.5 - # current lr is 0.6 + # set learning rate manually by python float value + lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] + for i in range(5): + adam.set_lr(lr_list[i]) + lr = adam.current_step_lr() + print("current lr is {}".format(lr)) + # Print: + # current lr is 0.2 + # current lr is 0.3 + # current lr is 0.4 + # current lr is 0.5 + # current lr is 0.6 # set learning rate manually by framework Tensor - lr_var = fluid.layers.create_global_var( + lr_var = paddle.create_global_var( shape=[1], value=0.7, dtype='float32') adam.set_lr(lr_var) lr = adam.current_step_lr() @@ -411,38 +404,37 @@ def current_step_lr(self): Examples: .. 
code-block:: python - import paddle.fluid as fluid import numpy as np import paddle # example1: LearningRateDecay is not used, return value is all the same - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) - adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters()) - lr = adam.current_step_lr() - print(lr) # 0.001 + paddle.disable_static() + emb = paddle.nn.Embedding([10, 10]) + adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters()) + lr = adam.current_step_lr() + print(lr) # 0.001 # example2: PiecewiseDecay is used, return the step learning rate - with fluid.dygraph.guard(): - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") - linear = fluid.dygraph.nn.Linear(10, 10) - inp = fluid.dygraph.to_variable(inp) - out = linear(inp) - loss = fluid.layers.reduce_mean(out) - - bd = [2, 4, 6, 8] - value = [0.2, 0.4, 0.6, 0.8, 1.0] - adam = paddle.optimizer.Adam(fluid.dygraph.PiecewiseDecay(bd, value, 0), - parameters=linear.parameters()) - - # first step: learning rate is 0.2 - np.allclose(adam.current_step_lr(), 0.2, rtol=1e-06, atol=0.0) # True - - # learning rate for different steps - ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0] - for i in range(12): - adam.minimize(loss) - lr = adam.current_step_lr() - np.allclose(lr, ret[i], rtol=1e-06, atol=0.0) # True + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.reduce_mean(out) + + bd = [2, 4, 6, 8] + value = [0.2, 0.4, 0.6, 0.8, 1.0] + adam = paddle.optimizer.Adam(paddle.PiecewiseDecay(bd, value, 0), + parameters=linear.parameters()) + + # first step: learning rate is 0.2 + np.allclose(adam.current_step_lr(), 0.2, rtol=1e-06, atol=0.0) # True + + # learning rate for different steps + ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0] + for i in range(12): + adam.minimize(loss) + lr = adam.current_step_lr() + np.allclose(lr, ret[i], rtol=1e-06, atol=0.0) # True """ current_lr = self._global_learning_rate() @@ -732,7 +724,7 @@ def backward(self, startup_program (Program, optional): :ref:`api_fluid_Program` for initializing parameters in ``parameters``. The default value is None, at this time :ref:`api_fluid_default_startup_program` will be used. - parameters (Iterable, optional): Iterable of ``Tensor`` or ``Tensor.name`` to update + parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update to minimize ``loss``. The default value is None, at this time all parameters will be updated. no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need @@ -771,7 +763,7 @@ def backward(self, program = loss.block.program assert len(loss.shape) == 1 and loss.shape[0] == 1, \ "The loss.shape should be (1L,), but the current loss.shape is {}. " \ - "Maybe that you should call fluid.layers.mean to process the current loss.".format( + "Maybe that you should call paddle.mean to process the current loss.".format( loss.shape) parameter_list = parameters if parameters \ else self._parameter_list @@ -799,7 +791,7 @@ def apply_gradients(self, params_grads): import paddle loss = network() - optimizer = paddle.optimizer.SGD(learning_rate=0.1) + optimizer = paddle.optimizer.SGD(learning_rate=0.1) params_grads = optimizer.backward(loss) # you may append operations for params_grads here # ... @@ -869,20 +861,19 @@ def clear_gradients(self): Examples: .. 
code-block:: python - import paddle.fluid as fluid import numpy as np import paddle - with fluid.dygraph.guard(): - value = np.arange(26).reshape(2, 13).astype("float32") - a = fluid.dygraph.to_variable(value) - linear = fluid.Linear(13, 5, dtype="float32") - # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(learning_rate = 0.01, - parameters = linear.parameters()) - out = linear(a) - out.backward() - adam.minimize(out) - adam.clear_gradients() + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.Adam(learning_rate = 0.01, + parameters = linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() """ for p in self._parameter_list: @@ -903,7 +894,7 @@ def minimize(self, startup_program (Program, optional): :ref:`api_fluid_Program` for initializing parameters in ``parameters``. The default value is None, at this time :ref:`api_fluid_default_startup_program` will be used. - parameters (Iterable, optional): Iterable of ``Tensor`` or ``Tensor.name`` to update + parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update to minimize ``loss``. The default value is None, at this time all parameters will be updated. no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need @@ -947,20 +938,18 @@ def step(self): .. code-block:: python import paddle - import paddle.fluid as fluid import numpy as np - - with fluie.dygraph.guard(): - value = np.arange(26).reshape(2, 13).astype("float32") - a = fluid.dygraph.to_variable(value) - linear = fluid.Linear(13, 5, dtype="float32") - # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(learning_rate = 0.01, - parameters = linear.parameters()) - out = linear(a) - out.backward() - adam.step() - adam.clear_gradients() + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.Adam(learning_rate = 0.01, + parameters = linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() """ parameter_list = self._parameter_list self._dtype = None From da4025d81d3254c51270a1a7063317fe62635edc Mon Sep 17 00:00:00 2001 From: MRXLT Date: Wed, 19 Aug 2020 15:35:16 +0800 Subject: [PATCH 09/30] add error message --- python/paddle/optimizer/optimizer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 1f2900a4aa0b3..346a8108a44b6 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -147,8 +147,10 @@ def state_dict(self): Get state dict information from optimizer. It contain all the variable used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be include in state dict. If the optimizer never be called(minimize function), the state_dict is empty. 
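
A save/restore round trip, stitched together as a minimal sketch from the ``state_dict``/``set_state_dict`` and ``paddle.framework.save``/``paddle.framework.load`` calls shown in the examples of this patch (the layer and file names are illustrative only):

        .. code-block:: python

            import paddle

            paddle.disable_static()
            emb = paddle.nn.Embedding([10, 10])
            adam = paddle.optimizer.Adam(learning_rate=0.001,
                                         parameters=emb.parameters())

            # capture the optimizer state (moments, beta accumulators, ...)
            state_dict = adam.state_dict()
            paddle.framework.save(state_dict, "paddle_dy")

            # later: rebuild the optimizer and restore its state
            para_state_dict, opti_state_dict = paddle.framework.load("paddle_dy")
            adam.set_state_dict(opti_state_dict)
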
-        Args: None
-        Return:
+        Args:
+            None
+
+        Returns:
             state_dict(dict) : dict contains all the Tensor used by optimizer

         Examples:
             .. code-block:: python

                 import paddle
                 paddle.disable_static():
                 emb = paddle.nn.Embedding([10, 10])

                 adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
                 state_dict = adam.state_dict()

        '''
@@ -465,7 +467,9 @@ def _global_learning_rate(self, program=None):

    def _append_optimize_op(self, block, param_and_grad):
        """ append optimize operator to block and return all the added optimize_op
        """
-        raise NotImplementedError()
+        raise NotImplementedError(
+            "Class \"Optimizer\" cannot be used directly as an optimizer, please use its subclasses such as \"Adam\""
+        )

    def _create_param_lr(self, param_and_grad):
        # create learning rate variable for every parameter

From f3699cb240360afa4238829ea70475972790cf9b Mon Sep 17 00:00:00 2001
From: MRXLT
Date: Wed, 19 Aug 2020 17:15:10 +0800
Subject: [PATCH 10/30] bug fix

---
 python/paddle/optimizer/__init__.py  | 2 +-
 python/paddle/optimizer/adam.py      | 2 +-
 python/paddle/optimizer/optimizer.py | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py
index 31a15f2e4482c..1092a7615a304 100644
--- a/python/paddle/optimizer/__init__.py
+++ b/python/paddle/optimizer/__init__.py
@@ -17,7 +17,7 @@
    'DGCMomentumOptimizer', 'Dpsgd', 'DpsgdOptimizer',
    'ExponentialMovingAverage', 'Ftrl', 'Lamb', 'LarsMomentum',
    'LookaheadOptimizer', 'ModelAverage', 'Momentum', 'PipelineOptimizer',
-    'RecomputeOptimizer', 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer'
+    'RecomputeOptimizer', 'RMSProp', 'SGD', 'Optimizer'
 ]

diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index b0b2af34f2c34..b8ffc8c5dc77d 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -173,10 +173,10 @@ def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate):

    def __init__(self,
                 learning_rate=0.001,
+                parameters=None,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-8,
-                parameters=None,
                 weight_decay=None,
                 grad_clip=None,
                 name=None,
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 346a8108a44b6..d97fdc93bfa00 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -157,7 +157,7 @@ def state_dict(self):
            .. code-block:: python

                import paddle
-                paddle.disable_static():
+                paddle.disable_static()
                emb = paddle.nn.Embedding([10, 10])

                adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
                state_dict = adam.state_dict()
@@ -196,7 +196,7 @@ def set_state_dict(self, state_dict):

        Examples:
            .. 
code-block:: python - paddle.disable_static(): + paddle.disable_static() emb = paddle.nn.Embedding([10, 10]) state_dict = emb.state_dict() From 6f00384be6cce54cb374c8d901c9d582774b4442 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Wed, 19 Aug 2020 19:20:17 +0800 Subject: [PATCH 11/30] refine rmsprop && adamax --- .../fluid/tests/unittests/test_rmsprop_op.py | 39 ++++ python/paddle/optimizer/__init__.py | 2 + python/paddle/optimizer/adam.py | 4 +- python/paddle/optimizer/adamax.py | 194 ++++++++++++++++ python/paddle/optimizer/adamw.py | 2 +- python/paddle/optimizer/optimizer.py | 2 +- python/paddle/optimizer/rmsprop.py | 210 ++++++++++++++++++ 7 files changed, 449 insertions(+), 4 deletions(-) create mode 100644 python/paddle/optimizer/adamax.py create mode 100644 python/paddle/optimizer/rmsprop.py diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index eb12bc7417673..5716dc4afc054 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -222,5 +222,44 @@ def test_rmsprop(self): size=size) +class TestRMSPropV2(unittest.TestCase): + def test_rmsprop_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.RMSProp( + learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=0.01) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_rmsprop(self): + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + rms_optimizer = paddle.optimizer.RMSProp(learning_rate=0.1) + rms_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index 1092a7615a304..24665404ab35c 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -30,3 +30,5 @@ from .optimizer import Optimizer from .adam import Adam from .adamw import AdamW +from .adamax import adamax +from .rmsprop import RMSProp diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index b8ffc8c5dc77d..e062f2f4ce2d3 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -63,7 +63,7 @@ class Adam(Optimizer): :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ the regularization setting here in optimizer will be ignored for this parameter. \ - Otherwis, the regularization setting here in optimizer will take effect. \ + Otherwise, the regularization setting here in optimizer will take effect. 
\ Default None, meaning there is no regularization. grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of some derived class of ``GradientClipBase`` . There are three cliping strategies @@ -123,7 +123,7 @@ class Adam(Optimizer): cost = fluid.layers.square_error_cost(input=y_predict, label=y) avg_cost = fluid.layers.mean(cost) - # define beta decay variable + # define beta decay Tensor def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate): global_step = lr_scheduler._decay_step_counter() diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py new file mode 100644 index 0000000000000..6ca59f20a6f86 --- /dev/null +++ b/python/paddle/optimizer/adamax.py @@ -0,0 +1,194 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .optimizer import Optimizer +from ..fluid import core +from ..fluid import framework +from ..fluid.framework import Variable + +__all__ = ["Adamax"] + + +class Adamax(Optimizer): + """ + The Adamax optimizer is implemented based on the Adamax Optimization + in Section 7 of `Adam paper `_. + The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm, + which makes the learning rate update algorithm more stable and simple. + + The parameter ``param_out`` update rule with gradient ``grad``: + + .. math:: + + t & = t + 1 + + moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad + + inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|) + + learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t} + + param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out} + + Related paper: `Adam: A Method for Stochastic Optimization `_ + + The original paper does not have an ``epsilon`` attribute, + it is added here for numerical stability to prevent the division by 0 error. + + Args: + learning_rate (float|Tensor, optional): The learning rate used to update ``Parameter``. + It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. + beta1 (float, optional): The exponential decay rate for the 1st moment estimates. + The default value is 0.9. + beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. + The default value is 0.999. + epsilon (float, optional): A small float value for numerical stability. + The default value is 1e-08. + parameters (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ + It canbe a float value as coeff of L2 regularization or \ + :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. 
+ If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ + the regularization setting here in optimizer will be ignored for this parameter. \ + Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + + **Notes**: + **Currently, AdamaxOptimizer doesn't support sparse parameter optimization.** + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy + + # First create the Executor. + place = fluid.CPUPlace() # fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + train_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + data = fluid.data(name='X', shape=[None, 1], dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + loss = fluid.layers.mean(hidden) + adam = paddle.optimizer.AdamaxOptimizer(learning_rate=0.2) + adam.minimize(loss) + + # Run the startup program once and only once. + exe.run(startup_program) + + x = numpy.random.random(size=(10, 1)).astype('float32') + outs = exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) + """ + _moment_acc_str = "moment" + _inf_norm_acc_str = "inf_norm" + _beta1_pow_acc_str = "beta1_pow_acc" + + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None): + assert learning_rate is not None + assert beta1 is not None + assert beta2 is not None + assert epsilon is not None + super(AdamaxOptimizer, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name) + self.type = "adamax" + self._beta1 = beta1 + self._beta2 = beta2 + self._epsilon = epsilon + + def _create_accumulators(self, block, parameters): + # Create accumulator tensors for first moment and infinity norm + for p in parameters: + self._add_accumulator(self._moment_acc_str, p) + self._add_accumulator(self._inf_norm_acc_str, p) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + fill_value=self._beta1, + shape=[1]) + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0]) + inf_norm = self._get_accumulator(self._inf_norm_acc_str, + param_and_grad[0]) + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) + # create the adamax optimize op + adamax_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "LearningRate": self._create_param_lr(param_and_grad), + "Moment": moment, + "InfNorm": inf_norm, + "Beta1Pow": beta1_pow_acc + }, + outputs={ + "ParamOut": param_and_grad[0], + "MomentOut": moment, + "InfNormOut": inf_norm + }, + attrs={ + "beta1": self._beta1, + "beta2": self._beta2, + "epsilon": self._epsilon + }, + stop_gradient=True) + + return 
adamax_op
+
+    def _finish_update(self, block, parameters_and_grads):
+        """Update Beta1 Power accumulator
+        """
+        assert isinstance(block, framework.Block)
+        for param, grad in parameters_and_grads:
+            if grad is None or param.trainable is False:
+                continue
+            with param.block.program._optimized_guard(
+                    [param, grad]), name_scope('adamx'):
+                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                      param)
+                block.append_op(
+                    type="scale",
+                    inputs={"X": beta1_pow_acc},
+                    outputs={"Out": beta1_pow_acc},
+                    attrs={"scale": self._beta1},
+                    stop_gradient=True)
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index fecc74ce2c2d7..97bc37a222697 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -23,7 +23,7 @@ class DecoupledWeightDecay(object):
    def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs):
        if not isinstance(coeff, float) and \
                not isinstance(coeff, framework.Variable):
-            raise TypeError("coeff should be float or Variable.")
+            raise TypeError("coeff should be float or Tensor.")
        self._params_name = set()
        self._apply_decay_param_fun = apply_decay_param_fun
        self._coeff = coeff
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index d97fdc93bfa00..2907ae2b76ec9 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -434,7 +434,7 @@ def current_step_lr(self):
            # learning rate for different steps
            ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0]
            for i in range(12):
-                adam.minimize(loss)
+                adam.step()
                lr = adam.current_step_lr()
                np.allclose(lr, ret[i], rtol=1e-06, atol=0.0) # True

diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py
new file mode 100644
index 0000000000000..6a5bd51469de5
--- /dev/null
+++ b/python/paddle/optimizer/rmsprop.py
@@ -0,0 +1,210 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from ..fluid import core
+from ..fluid import framework
+from ..fluid.framework import Variable
+
+__all__ = ["RMSProp"]
+
+
+class RMSProp(Optimizer):
+    """
+    Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
+    rate method. The original slides proposed RMSProp: Slide 29 of
+    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .
+
+    The original equation is as follows:
+
+    .. math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+
+        w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+
+    The first equation calculates moving average of the squared gradient for
+    each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`.
+
+    In some cases, adding a momentum term :math: `\\beta` is beneficial.
+    In our implementation, Nesterov momentum is used:
+
+    .. 
math:: + + r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 + + v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) + + \\epsilon}} \\nabla Q_{i}(w) + + w & = w - v(w, t) + + if centered is True: + + .. math:: + + r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 + + g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w) + + v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 + + \\epsilon}} \\nabla Q_{i}(w) + + w & = w - v(w, t) + + where, :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95 + and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a + smoothing term to avoid division by zero, usually set somewhere in range + from 1e-4 to 1e-8. + + + Parameters: + learning_rate(float): Global learning rate. + rho(float): rho is :math: `\\rho` in equation, default is 0.95. + epsilon(float): :math: `\\epsilon` in equation is smoothing term to + avoid division by zero, default is 1e-6. + momentum(float): :math:`\\beta` in equation is the momentum term, + default is 0.0. + centered(bool): If True, gradients are normalized by the estimated variance of + the gradient; if False, by the uncentered second moment. Setting this to + True may help with training, but is slightly more expensive in terms of + computation and memory. Defaults to False. + parameters (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ + It canbe a float value as coeff of L2 regularization or \ + :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. + If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ + the regularization setting here in optimizer will be ignored for this parameter. \ + Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): This parameter is used by developers to print debugging information. \ + For details, please refer to :ref:`api_guide_Name`. Default is None. + + Raises: + ValueError: If learning_rate, rho, epsilon, momentum are None. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + import numpy as np + + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + rms_optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) + rms_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + """ + + _momentum_acc_str = "momentum" + _mean_square_acc_str = "mean_square" + _mean_grad_acc_str = "mean_grad" + + def __init__(self, + learning_rate, + rho=0.95, + epsilon=1.0e-6, + momentum=0.0, + centered=False, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None): + super(RMSProp, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name) + if learning_rate is None: + raise ValueError("learning_rate is not set.") + if rho is None: + raise ValueError("rho is not set.") + if epsilon is None: + raise ValueError("epsilon is not set.") + if momentum is None: + raise ValueError("momentum is not set.") + + self.type = "rmsprop" + self._rho = rho + self._epsilon = epsilon + self._momentum = momentum + self._centered = centered + + def _create_accumulators(self, block, parameters): + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + for p in parameters: + self._add_accumulator(self._momentum_acc_str, p) + self._add_accumulator(self._mean_square_acc_str, p) + self._add_accumulator(self._mean_grad_acc_str, p) + + def _append_optimize_op(self, block, param_and_grad): + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + momentum_acc = self._get_accumulator(self._momentum_acc_str, + param_and_grad[0]) + mean_square_acc = self._get_accumulator(self._mean_square_acc_str, + param_and_grad[0]) + mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str, + param_and_grad[0]) + rmsprop_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Moment": momentum_acc, + "MeanSquare": mean_square_acc, + "MeanGrad": mean_grad_acc, + "LearningRate": self._create_param_lr(param_and_grad), + }, + outputs={ + "ParamOut": param_and_grad[0], + "MomentOut": momentum_acc, + "MeanSquareOut": mean_square_acc, + "MeanGradOut": mean_grad_acc + }, + attrs={ + "epsilon": self._epsilon, + "decay": self._rho, + "momentum": self._momentum, + "centered": self._centered + }, + stop_gradient=True) + + return rmsprop_op From 654377d155324d7d66b88e530bcf46b1c2b602f0 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Wed, 19 Aug 2020 19:48:58 +0800 Subject: [PATCH 12/30] fix ci --- .../tests/unittests/test_retain_graph.py | 4 ++-- python/paddle/optimizer/__init__.py | 2 +- python/paddle/optimizer/optimizer.py | 23 ++++++++++++------- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_retain_graph.py 
b/python/paddle/fluid/tests/unittests/test_retain_graph.py index 360a2de1df6ed..1053ff84ac06d 100644 --- a/python/paddle/fluid/tests/unittests/test_retain_graph.py +++ b/python/paddle/fluid/tests/unittests/test_retain_graph.py @@ -96,8 +96,8 @@ def run_retain(self, need_retain): g = Generator() d = Discriminator() - optim_g = paddle.optimizer.Adam(parameter_list=g.parameters()) - optim_d = paddle.optimizer.Adam(parameter_list=d.parameters()) + optim_g = paddle.optimizer.Adam(parameters=g.parameters()) + optim_d = paddle.optimizer.Adam(parameters=d.parameters()) gan_criterion = paddle.nn.MSELoss() l1_criterion = paddle.nn.L1Loss() diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index 24665404ab35c..2c84953f0225b 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -30,5 +30,5 @@ from .optimizer import Optimizer from .adam import Adam from .adamw import AdamW -from .adamax import adamax +from .adamax import Adamax from .rmsprop import RMSProp diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 2907ae2b76ec9..75446da8bc5f8 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -195,7 +195,7 @@ def set_state_dict(self, state_dict): Examples: .. code-block:: python - + import paddle paddle.disable_static() emb = paddle.nn.Embedding([10, 10]) @@ -793,13 +793,20 @@ def apply_gradients(self, params_grads): Examples: .. code-block:: python - import paddle - loss = network() - optimizer = paddle.optimizer.SGD(learning_rate=0.1) - params_grads = optimizer.backward(loss) - # you may append operations for params_grads here - # ... - optimizer.apply_gradients(params_grads) + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + optimizer = paddle.optimizer.Adam(learning_rate=0.1, + parameters=linear.parameters()) + params_grads = optimizer.backward(loss) + optimizer.apply_gradients(params_grads) + """ params_grads = sorted(params_grads, key=lambda x: x[0].name) From fa7ccb1eda57ec54d445962180fd386fcb18dbb2 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Wed, 19 Aug 2020 21:09:56 +0800 Subject: [PATCH 13/30] buf fix --- python/paddle/optimizer/adam.py | 112 +++++++++------------------ python/paddle/optimizer/adamax.py | 50 ++++++------ python/paddle/optimizer/adamw.py | 71 ++++++++++------- python/paddle/optimizer/optimizer.py | 1 + python/paddle/optimizer/rmsprop.py | 37 ++++----- 5 files changed, 121 insertions(+), 150 deletions(-) diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index e062f2f4ce2d3..b2c66e5d8a111 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -84,87 +84,45 @@ class Adam(Optimizer): .. 
code-block:: python import paddle - import paddle.fluid as fluid - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.data(name='x', shape=[None, 13], dtype='float32') - y = fluid.data(name='y', shape=[None, 1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - adam_optimizer = paddle.optimizer.Adam(0.01) - adam_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=0.1, + parameters=linear.parameters()) + out.backward() + adam.step() + adam.clear_grad() .. code-block:: python - # Adam with beta1/beta2 as Tensor + # Adam with beta1/beta2 as Tensor and weight_decay as float import paddle - import paddle.fluid as fluid - import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.data(name='x', shape=[None, 13], dtype='float32') - y = fluid.data(name='y', shape=[None, 1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - # define beta decay Tensor - def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate): - global_step = lr_scheduler._decay_step_counter() - - beta1 = fluid.layers.create_global_var( - shape=[1], - value=float(beta1_init), - dtype='float32', - # set persistable for save checkpoints and resume - persistable=True, - name="beta1") - beta2 = fluid.layers.create_global_var( - shape=[1], - value=float(beta2_init), - dtype='float32', - # set persistable for save checkpoints and resume - persistable=True, - name="beta2") - - div_res = global_step / decay_steps - decayed_beta1 = beta1_init * (decay_rate**div_res) - decayed_beta2 = beta2_init * (decay_rate**div_res) - fluid.layers.assign(decayed_beta1, beta1) - fluid.layers.assign(decayed_beta2, beta2) - - return beta1, beta2 - - beta1, beta2 = get_decayed_betas(0.9, 0.99, 1e5, 0.9) - adam_optimizer = paddle.optimizer.Adam( - learning_rate=0.01, - beta1=beta1, - beta2=beta2) - adam_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + + adam = paddle.optimizer.Adam(learning_rate=0.1, + 
parameters=linear.parameters(), + beta1=beta1, + beta2=beta2, + weight_decay=0.01) + out.backward() + adam.step() + adam.clear_grad() + """ _moment1_acc_str = "moment1" _moment2_acc_str = "moment2" diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index 6ca59f20a6f86..60d6b75b5a962 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -15,7 +15,7 @@ from .optimizer import Optimizer from ..fluid import core from ..fluid import framework -from ..fluid.framework import Variable +from ..fluid.framework import Variable, name_scope __all__ = ["Adamax"] @@ -78,30 +78,28 @@ class Adamax(Optimizer): Examples: .. code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + + adam = paddle.optimizer.Adamax(learning_rate=0.1, + parameters=linear.parameters(), + beta1=beta1, + beta2=beta2, + weight_decay=0.01) + out.backward() + adam.step() + adam.clear_grad() - import paddle.fluid as fluid - import numpy - - # First create the Executor. - place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - data = fluid.data(name='X', shape=[None, 1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) - adam = paddle.optimizer.AdamaxOptimizer(learning_rate=0.2) - adam.minimize(loss) - - # Run the startup program once and only once. - exe.run(startup_program) - - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) """ _moment_acc_str = "moment" _inf_norm_acc_str = "inf_norm" @@ -120,7 +118,7 @@ def __init__(self, assert beta1 is not None assert beta2 is not None assert epsilon is not None - super(AdamaxOptimizer, self).__init__( + super(Adamax, self).__init__( learning_rate=learning_rate, parameters=parameters, weight_decay=weight_decay, @@ -183,7 +181,7 @@ def _finish_update(self, block, parameters_and_grads): if grad is None or param.trainable is False: continue with param.block.program._optimized_guard( - [param, grad]), name_scope('adamx'): + [param, grad]), name_scope('adamax'): beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, param) block.append_op( diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 97bc37a222697..ba60f920492ca 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -183,33 +183,52 @@ class AdamW(DecoupledWeightDecay, Adam): Examples: .. 
code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + + adam = paddle.optimizer.AdamW(learning_rate=0.1, + parameters=linear.parameters(), + beta1=beta1, + beta2=beta2, + weight_decay=0.01) + out.backward() + adam.step() + adam.clear_grad() - import paddle.fluid as fluid - import paddle - import numpy - - # First create the Executor. - place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - data = fluid.data(name='X', shape=[None, 1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) - adam = paddle.optimizer.AdamW(learning_rate=0.2) - adam.minimize(loss) - - # Run the startup program once and only once. - exe.run(startup_program) - - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) """ - def __init__(self, weight_decay, apply_decay_param_fun=None, **kwargs): + def __init__(self, + weight_decay, + apply_decay_param_fun=None, + learning_rate=0.001, + parameters=None, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + grad_clip=None, + name=None, + lazy_mode=False): + args_dict = { + "learning_rate": learning_rate, + "parameters": parameters, + "beta1": beta1, + "beta2": beta2, + "epsilon": epsilon, + "grad_clip": grad_clip, + "name": name, + "lazy_mode": lazy_mode + } super(AdamW, self).__init__( - weight_decay, apply_decay_param_fun=apply_decay_param_fun, **kwargs) + weight_decay, + apply_decay_param_fun=apply_decay_param_fun, + **args_dict) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 75446da8bc5f8..d6789ababaa32 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -140,6 +140,7 @@ def __init__(self, self._opti_name_list = [] self._accumulators_holder = {} self._param_device_map = dict() + self.clear_grad = self.clear_gradients @framework.dygraph_only def state_dict(self): diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 6a5bd51469de5..9b72ef59a02ff 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -103,29 +103,24 @@ class RMSProp(Optimizer): .. 
code-block:: python import paddle - import paddle.fluid as fluid import numpy as np - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - rms_optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) - rms_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + + adam = paddle.optimizer.RMSProp(learning_rate=0.1, + parameters=linear.parameters(), + weight_decay=0.01) + out.backward() + adam.step() + adam.clear_grad() """ From 9aaf8993eba2c5581102a5d10a158beb9c33090f Mon Sep 17 00:00:00 2001 From: MRXLT Date: Wed, 19 Aug 2020 21:39:07 +0800 Subject: [PATCH 14/30] update comment --- python/paddle/optimizer/optimizer.py | 48 ++++++++++++++-------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index d6789ababaa32..a8efcd6eecc79 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -126,14 +126,14 @@ def __init__(self, # the learning rate type should be inferenced from loss self._dtype = None # each program should have a independent learning rate - # program -> Variable(learning_rate) + # program -> tensor(learning_rate) self._learning_rate_map = dict() if isinstance(self._learning_rate, framework.Variable): self._learning_rate_map[framework.default_main_program( )] = self._learning_rate # Dictionary of accumulators. Some optimizer subclasses need to - # allocate and manage extra variables associated with the parameters - # to train. These variables are called accumulators. + # allocate and manage extra tensors associated with the parameters + # to train. These tensors are called accumulators. # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} self._accumulators = defaultdict(lambda: dict()) self.helper = None @@ -145,7 +145,7 @@ def __init__(self, @framework.dygraph_only def state_dict(self): ''' - Get state dict information from optimizer. It contain all the variable used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be include in state dict. + Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be include in state dict. If the optimizer never be called(minimize function), the state_dict is empty. 
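
To connect the update equations documented in ``rmsprop.py`` above with what a single ``RMSProp`` step does, a rough NumPy sketch of one uncentered update (illustrative only, not the actual operator implementation; the values are made up):

        .. code-block:: python

            import numpy as np

            # hyper-parameters as documented: rho, epsilon, momentum, learning rate
            rho, eps, momentum, lr = 0.95, 1e-6, 0.0, 0.1

            w = np.array([1.0, 2.0])   # parameter
            g = np.array([0.5, -0.3])  # gradient of the loss w.r.t. w
            r = np.zeros_like(w)       # moving average of squared gradients
            v = np.zeros_like(w)       # momentum buffer

            # r <- rho * r + (1 - rho) * g^2
            r = rho * r + (1 - rho) * g * g
            # v <- momentum * v + lr * g / sqrt(r + eps)
            v = momentum * v + lr * g / np.sqrt(r + eps)
            # w <- w - v
            w = w - v
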
Args: @@ -274,7 +274,7 @@ def get_opti_var_name_list(self): def _create_global_learning_rate(self): if imperative_base.enabled(): - # create learning rate Variable + # create learning rate tensor if isinstance(self._learning_rate, float): lr = self._global_learning_rate() @@ -473,7 +473,7 @@ def _append_optimize_op(self, block, param_and_grad): ) def _create_param_lr(self, param_and_grad): - # create learning rate variable for every parameter + # create learning rate tensor for every parameter param = param_and_grad[0] param_lr = param.optimize_attr['learning_rate'] if type(param_lr) == Variable: @@ -491,8 +491,8 @@ def _create_accumulators(self, block, parameters): """Create all accumulators needed by the parameters Args: - block: the block in which the loss variable is present - parameters: list of parameter variables for the optimizer + block: the block in which the loss tensor is present + parameters: list of parameter tensors for the optimizer """ pass @@ -501,8 +501,8 @@ def _finish_update(self, block, parameters_and_grads): before completing an optimization step Args: - block: the block in which the loss variable is present - parameters: list of parameter variables for the optimizer + block: the block in which the loss tensor is present + parameters: list of parameter tensors for the optimizer Returns: None @@ -520,11 +520,11 @@ def _add_accumulator(self, """Utility function to add an accumulator for a parameter Args: - block: the block in which the loss variable is present + block: the block in which the loss tensor is present name: name of the accumulator - param: parameter variable for which accumulator is to be added - dtype: data type of the accumulator variable - fill_value: value to initialize the accumulator variable + param: parameter tensor for which accumulator is to be added + dtype: data type of the accumulator tensor + fill_value: value to initialize the accumulator tensor """ if self._name is not None: name = self._name + "_" + name @@ -569,10 +569,10 @@ def _get_accumulator(self, name, param): Args: name: name of the accumulator - param: parameter variable for which accumulator is to be fetched + param: parameter tensor for which accumulator is to be fetched Returns: - accumulator variable for the parameter + accumulator tensor for the parameter """ if self._name is not None: name = self._name + "_" + name @@ -603,11 +603,11 @@ def _get_device_for_param(self, param_name): return device def _create_optimization_pass(self, parameters_and_grads): - """Add optimization operators to update gradients to variables. + """Add optimization operators to update gradients to tensors. Args: parameters_and_grads(list(tuple(Tensor, Tensor))): - a list of (variable, gradient) pair to update. + a list of (tensor, gradient) pair to update. Returns: return_op_list: a list of operators that will complete one step of @@ -676,7 +676,7 @@ def _process_distribute_lookuptable(self, param_grads): and avoid to add regularization and other op for it, and add sgd optimize op for it independently. :param param_grads(list((Var, Var))): list of (param, grad) pair. - :param loss: the loss variable. + :param loss: the loss tensor. :param startup_program: the startup program """ program = framework.default_main_program() @@ -725,7 +725,7 @@ def backward(self, the current program. Args: - loss (Tensor): ``loss`` variable to run optimizations. + loss (Tensor): ``loss`` tensor to run optimizations. startup_program (Program, optional): :ref:`api_fluid_Program` for initializing parameters in ``parameters``. 
The default value is None, at this time :ref:`api_fluid_default_startup_program` will be used. @@ -738,7 +738,7 @@ def backward(self, operator for one parameter. The default value is None. Return: - list: list of (param, grad) variable pairs, param is ``Parameter``, + list: list of (param, grad) tensor pairs, param is ``Parameter``, grad is the gradient value corresponding to the parameter. Examples: @@ -757,7 +757,7 @@ def backward(self, if not param.trainable: continue if param._grad_ivar() is not None: - # create gradient variable + # create gradient tensor grad_var = param._grad_ivar() params_grads.append((param, grad_var)) else: @@ -831,7 +831,7 @@ def apply_optimize(self, loss, startup_program, params_grads): Second part of `minimize`, appending optimization operators for given `params_grads` pairs. Args: - loss (Tensor): loss variable to run optimizations. + loss (Tensor): loss tensor to run optimizations. startup_program (Program): startup_program for initializing parameters in `parameters`. params_grads (list): list of (param, grad) pair to do optimization. @@ -914,7 +914,7 @@ def minimize(self, Returns: tuple: tuple (optimize_ops, params_grads), A list of operators appended - by minimize and a list of (param, grad) variable pairs, param is + by minimize and a list of (param, grad) tensor pairs, param is ``Parameter``, grad is the gradient value corresponding to the parameter. The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to indicate program pruning. If so, the program will be pruned by ``feed`` and From b727dad0135e0886676e7ef0a2129f10be05412f Mon Sep 17 00:00:00 2001 From: MRXLT Date: Thu, 20 Aug 2020 10:44:21 +0800 Subject: [PATCH 15/30] unify arguments place; notest --- python/paddle/optimizer/adam.py | 2 +- python/paddle/optimizer/adamw.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index b2c66e5d8a111..405f68a6845d0 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -131,10 +131,10 @@ class Adam(Optimizer): def __init__(self, learning_rate=0.001, - parameters=None, beta1=0.9, beta2=0.999, epsilon=1e-8, + parameters=None, weight_decay=None, grad_clip=None, name=None, diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index ba60f920492ca..fdffe442ef81b 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -211,10 +211,10 @@ def __init__(self, weight_decay, apply_decay_param_fun=None, learning_rate=0.001, - parameters=None, beta1=0.9, beta2=0.999, epsilon=1e-8, + parameters=None, grad_clip=None, name=None, lazy_mode=False): From 9cf4c3b4b6ff59bc4384c73aa0d84087a4a474b3 Mon Sep 17 00:00:00 2001 From: mapingshuo Date: Thu, 20 Aug 2020 10:41:12 +0800 Subject: [PATCH 16/30] fix ut, test=develop --- python/paddle/fluid/optimizer.py | 18 ++++++++++-------- .../test_dist_fleet_a_sync_optimizer_async.py | 4 ++-- .../test_dist_fleet_a_sync_optimizer_geo.py | 2 +- .../test_dist_fleet_a_sync_optimizer_sync.py | 2 +- .../tests/unittests/test_dist_fleet_ps2.py | 2 +- .../unittests/test_fleet_amp_meta_optimizer.py | 2 +- .../fluid/tests/unittests/test_fleet_base.py | 4 ++-- .../unittests/test_fleet_dgc_meta_optimizer.py | 8 +++++--- ...test_fleet_gradient_merge_meta_optimizer.py | 2 +- ...est_fleet_graph_execution_meta_optimizer.py | 4 ++-- .../unittests/test_fleet_graph_executor.py | 2 +- .../test_fleet_lamb_meta_optimizer.py | 7 ++++--- .../test_fleet_lars_meta_optimizer.py | 
5 +++-- .../test_fleet_localsgd_meta_optimizer.py | 2 +- .../test_fleet_pipeline_meta_optimizer.py | 2 +- .../test_fleet_recompute_meta_optimizer.py | 2 +- .../fluid/tests/unittests/test_rmsprop_op.py | 1 + 17 files changed, 38 insertions(+), 31 deletions(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index d7e6a9d211b8b..a535c48cd1ff5 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -40,6 +40,7 @@ from functools import reduce from .wrapped_decorator import signature_safe_contextmanager from .. import compat as cpt +import paddle __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad', @@ -1141,7 +1142,7 @@ def _append_optimize_op(self, block, param_and_grad): class DGCMomentumOptimizer(Optimizer): """ - :api_attr: Static Graph + :api_attr: Static Graph DGC (Deep Gradient Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887 @@ -3067,7 +3068,7 @@ def _append_optimize_op(self, block, param_and_grad): class ModelAverage(Optimizer): """ - :api_attr: Static Graph + :api_attr: Static Graph The ModelAverage optimizer accumulates specific continuous historical parameters during training. The accumulated historical range can be controlled by the passed @@ -3376,7 +3377,7 @@ def restore(self, executor): class ExponentialMovingAverage(object): """ - :api_attr: Static Graph + :api_attr: Static Graph Compute the moving average of parameters with exponential decay. Given a parameter :math:`\\theta`, its exponential moving average (EMA) @@ -3626,7 +3627,7 @@ def restore(self, executor): class PipelineOptimizer(object): """ - :api_attr: Static Graph + :api_attr: Static Graph Pipeline Optimizer: Make a program to run as pipeline, that is splitting a program into multiple sections (sub-programs) and each section run on a @@ -3690,7 +3691,8 @@ def train_reader(): def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): if framework.in_dygraph_mode(): raise Exception("In dygraph, don't support PipelineOptimizer.") - if not isinstance(optimizer, Optimizer): + if not isinstance(optimizer, Optimizer) and not isinstance( + optimizer, paddle.optimizer.Optimizer): raise ValueError("The 'optimizer' parameter for " "PipelineOptimizer must be an instance of " "Optimizer, but the given type is {}.".format( @@ -4477,7 +4479,7 @@ def minimize(self, class RecomputeOptimizer(Optimizer): """ - :api_attr: Static Graph + :api_attr: Static Graph Recompute Optimizer Wrapper @@ -4562,7 +4564,7 @@ def _set_checkpoints(self, checkpoints): def load(self, stat_dict): """ - :api_attr: Static Graph + :api_attr: Static Graph load function is not supported by Recompute Optimizer for now. :return: None @@ -4786,7 +4788,7 @@ def minimize(self, class LookaheadOptimizer(object): """ - :api_attr: Static Graph + :api_attr: Static Graph This implements the Lookahead optimizer of the paper : https://arxiv.org/abs/1907.08610. 
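
The widened ``isinstance`` check in the ``PipelineOptimizer.__init__`` hunk above is what lets the new ``paddle.optimizer`` classes be wrapped the same way as the old ``fluid`` ones; a minimal sketch of the intent (constructor arguments are illustrative, and a static-graph program is assumed since ``PipelineOptimizer`` raises in dygraph mode):

.. code-block:: python

    import paddle
    import paddle.fluid as fluid

    # either optimizer family now passes PipelineOptimizer's type check
    old_style = fluid.optimizer.SGD(learning_rate=0.01)
    new_style = paddle.optimizer.Adam(learning_rate=0.01)

    pipe_old = fluid.optimizer.PipelineOptimizer(old_style, num_microbatches=2)
    pipe_new = fluid.optimizer.PipelineOptimizer(new_style, num_microbatches=2)
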
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py index 28bd637726ebe..0ac755b9aed81 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py @@ -55,7 +55,7 @@ def test_a_sync_optimizer_trainer(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -100,7 +100,7 @@ def test_a_sync_optimizer_pserver(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py index 9cd35f1754ff7..bc7968f0595ed 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py @@ -55,7 +55,7 @@ def test_a_sync_optimizer_trainer(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True strategy.a_sync_configs = {"k_steps": 100} - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py index c8130d62c304b..4b4cc7e43171d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py @@ -47,7 +47,7 @@ def test_gradient_merge_optimizer(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = False - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py index 5fcf5d894b2ee..f27a8bd858f45 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py @@ -165,7 +165,7 @@ def test(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(loss) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py index 22a1434ae251a..38c3903306e6e 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py +++ 
b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py @@ -51,7 +51,7 @@ def test_amp_optimizer(self): "custom_black_list": ['tanh'], } - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py index ca657a5a619b6..e3f1fceebf06e 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py @@ -116,7 +116,7 @@ def test_distributed_optimizer(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) - optimizer = paddle.optimizer.SGD(learning_rate=0.001) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001) optimizer = fleet.distributed_optimizer(optimizer) def test_minimize(self): @@ -134,7 +134,7 @@ def test_minimize(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) strategy = fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.001) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py index b43687ce1cdab..55d4ff7726aac 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py @@ -60,7 +60,8 @@ def test_dgc_optimizer(self): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -72,7 +73,7 @@ def test_dgc_not_apply_with_adam(self): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -87,7 +88,8 @@ def test_dgc_not_apply_with_one_worker(self): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py index 581f8becbbff1..af72df5186876 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py @@ -44,7 +44,7 @@ def test_gradient_merge_optimizer(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.gradient_merge = True strategy.gradient_merge_configs = {"k_steps": 2, "avg": True} - 
optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py index 3e97ab3bfc66c..26e004164eb95 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py @@ -58,7 +58,7 @@ def node_func(): avg_cost = paddle.fluid.layers.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -111,7 +111,7 @@ def node_func(): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.nccl_comm_num = 2 strategy.sync_nccl_allreduce = True - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluidoptimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py index 4d92c6f70541d..69f5b134888b0 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py @@ -60,7 +60,7 @@ def node_func(): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.nccl_comm_num = 2 strategy.sync_nccl_allreduce = True - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py index 134aea363b55e..3f140f53b043b 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py @@ -62,7 +62,7 @@ def test_lamb_optimizer(self): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -75,7 +75,8 @@ def test_lamb_not_apply_with_momentum(self): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Momentum(learning_rate=0.1, momentum=0.9) + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.1, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -88,7 +89,7 @@ def test_lamb_exclude_fn(self): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) strategy.lamb_configs = { 'lamb_weight_decay': 0.01, 
'exclude_from_weight_decay': ['.b_0'], diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py index b15db0b12d001..3caa1a4eac0bf 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py @@ -62,7 +62,8 @@ def test_lars_optimizer(self): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -75,7 +76,7 @@ def test_lars_not_apply_with_adam(self): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py index 86098d42b823b..07b988bf87520 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py @@ -46,7 +46,7 @@ def test_localsgd_optimizer(self): config['k_steps'] = 1 strategy.localsgd_configs = config - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py index ca969bc4032b1..adbb1268c6f4d 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py @@ -53,7 +53,7 @@ def test_pipeline_optimizer(self): strategy.pipeline = True strategy.pipeline_configs = {'micro_batch': 2} - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py index 95e1c3a360257..a42010a4eaa50 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py @@ -45,7 +45,7 @@ def test_recompute_optimizer(self): strategy.recompute = True strategy.recompute_configs = {"checkpoints": ["fc_1.tmp_0"]} - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index 5716dc4afc054..5d50c7d93c9d5 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -20,6 +20,7 @@ import paddle.fluid.core as core from paddle.fluid.op import Operator import paddle.fluid as fluid +import paddle def create_selected_rows_and_tensor(scope, place, height, row_num, From 2e8d253c8ac22186b19b2d6fdde10967932c10b1 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Thu, 20 Aug 2020 13:37:29 +0800 Subject: [PATCH 17/30] bug fix --- python/paddle/optimizer/__init__.py | 18 +++++++---- python/paddle/optimizer/adamw.py | 8 ++--- python/paddle/optimizer/optimizer.py | 48 +++++++++++++++++++++++++--- 3 files changed, 58 insertions(+), 16 deletions(-) diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index 2c84953f0225b..7159baeb30512 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -13,19 +13,23 @@ # limitations under the License. __all__ = [ - 'Adadelta', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'AdamW' + 'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam', + 'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer', 'DGCMomentumOptimizer', 'Dpsgd', 'DpsgdOptimizer', - 'ExponentialMovingAverage', 'Ftrl', 'Lamb', 'LarsMomentum', - 'LookaheadOptimizer', 'ModelAverage', 'Momentum', 'PipelineOptimizer', - 'RecomputeOptimizer', 'RMSProp', 'SGD', 'Optimizer' + 'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer', 'LambOptimizer', + 'LarsMomentum', 'LarsMomentumOptimizer', 'LookaheadOptimizer', + 'ModelAverage', 'Momentum', 'MomentumOptimizer', 'PipelineOptimizer', + 'RecomputeOptimizer', 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer' ] from ..fluid.optimizer import SGD, Momentum, Adagrad, Dpsgd, DecayedAdagrad, \ - Ftrl, Adadelta, Lamb, RMSProp, \ - ModelAverage, LarsMomentum, DGCMomentumOptimizer, \ + Ftrl, Adadelta, \ + SGDOptimizer, MomentumOptimizer, AdagradOptimizer,DpsgdOptimizer,\ + DecayedAdagradOptimizer,FtrlOptimizer,AdadeltaOptimizer, \ + ModelAverage, LarsMomentum, DGCMomentumOptimizer, LambOptimizer,\ ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, \ - RecomputeOptimizer + RecomputeOptimizer, LarsMomentumOptimizer from .optimizer import Optimizer from .adam import Adam diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index fdffe442ef81b..d0bb771c2ecae 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -69,8 +69,8 @@ def _scale_parameters(self, params_and_grads): def backward(self, **kargs): return super(DecoupledWeightDecay, self).backward(**kargs) - def apply_optimize(self, **kargs): - return super(DecoupledWeightDecay, self).apply_optimize(**kargs) + def _apply_optimize(self, **kargs): + return super(DecoupledWeightDecay, self)._apply_optimize(**kargs) def minimize(self, loss, @@ -91,7 +91,7 @@ def minimize(self, x=param, y=scaled_param) paddle.fluid.layers.assign(input=updated_param, output=param) - optimize_ops = self.apply_optimize( + optimize_ops = self._apply_optimize( loss=loss, params_grads=params_grads, startup_program=startup_program) @@ -117,7 +117,7 @@ def step(self): updated_param = paddle.fluid.layers.elementwise_sub( x=param, y=scaled_param) paddle.fluid.layers.assign(input=updated_param, output=param) - optimize_ops = self.apply_optimize( + optimize_ops = self._apply_optimize( loss=None, startup_program=None, params_grads=params_grads) def __str__(self): diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index a8efcd6eecc79..2dabca5ad22b9 100644 --- 
a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -742,7 +742,21 @@ def backward(self, grad is the gradient value corresponding to the parameter. Examples: - See examples in ``apply_gradients``. + .. code-block:: python + + import paddle + import numpy as np + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.Adam(learning_rate = 0.01, + parameters = linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() """ act_no_grad_set = None if framework.in_dygraph_mode(): @@ -826,7 +840,7 @@ def apply_gradients(self, params_grads): optimize_ops = self._create_optimization_pass(params_grads) return optimize_ops - def apply_optimize(self, loss, startup_program, params_grads): + def _apply_optimize(self, loss, startup_program, params_grads): """ Second part of `minimize`, appending optimization operators for given `params_grads` pairs. @@ -921,7 +935,31 @@ def minimize(self, ``fetch_list`` before run, see details in ``Executor``. Examples: - Please refer to the example of current Optimizer. + .. code-bloack:: python + import paddle + import paddle.fluid as fluid + + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.data(name='x', shape=[None, 13], dtype='float32') + y = fluid.data(name='y', shape=[None, 1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + adam_optimizer = paddle.optimizer.AdamOptimizer(0.01) + adam_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + """ assert isinstance(loss, Variable), "The loss should be an Tensor." 
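
For reference: together with the hunk that follows, this patch makes ``apply_optimize`` private. After the rename, ``minimize`` amounts to ``backward`` followed by ``_apply_optimize``, and the dygraph ``step`` gathers each parameter's gradient before calling the same helper, so user code should rely on ``minimize`` or ``step`` rather than the renamed method. A simplified sketch of what ``minimize(loss)`` now does internally (argument names as in the surrounding diff):

    params_grads = self.backward(
        loss, startup_program=startup_program,
        parameters=parameter_list, no_grad_set=no_grad_set)
    optimize_ops = self._apply_optimize(
        loss, startup_program=startup_program, params_grads=params_grads)
    return optimize_ops, params_grads
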
@@ -933,7 +971,7 @@ def minimize(self, parameters=parameter_list, no_grad_set=no_grad_set) - optimize_ops = self.apply_optimize( + optimize_ops = self._apply_optimize( loss, startup_program=startup_program, params_grads=params_grads) return optimize_ops, params_grads @@ -973,5 +1011,5 @@ def step(self): grad_var = param._grad_ivar() params_grads.append((param, grad_var)) - optimize_ops = self.apply_optimize( + optimize_ops = self._apply_optimize( loss=None, startup_program=None, params_grads=params_grads) From 00c38fc3265762bbf36cad836edab5093411aa13 Mon Sep 17 00:00:00 2001 From: mapingshuo Date: Thu, 20 Aug 2020 15:34:07 +0800 Subject: [PATCH 18/30] fix conflicts, test=develop --- python/paddle/fluid/tests/unittests/test_fleet_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py index e3f1fceebf06e..ca657a5a619b6 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py @@ -116,7 +116,7 @@ def test_distributed_optimizer(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001) + optimizer = paddle.optimizer.SGD(learning_rate=0.001) optimizer = fleet.distributed_optimizer(optimizer) def test_minimize(self): @@ -134,7 +134,7 @@ def test_minimize(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) strategy = fleet.DistributedStrategy() - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001) + optimizer = paddle.optimizer.SGD(learning_rate=0.001) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) From b75ab16659c53387ff50c1ecb4a3e2a2c682f5f2 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Thu, 20 Aug 2020 15:44:37 +0800 Subject: [PATCH 19/30] add examples code --- python/paddle/optimizer/adam.py | 2 +- python/paddle/optimizer/adamax.py | 2 +- python/paddle/optimizer/adamw.py | 15 +++--- python/paddle/optimizer/optimizer.py | 75 ++++++++++++++-------------- python/paddle/optimizer/rmsprop.py | 2 +- 5 files changed, 48 insertions(+), 48 deletions(-) diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 405f68a6845d0..b829bcd6047be 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -55,7 +55,7 @@ class Adam(Optimizer): The default value is 0.999. epsilon (float, optional): A small float value for numerical stability. The default value is 1e-08. - parameters (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ + parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index 60d6b75b5a962..82cfbaf4a0304 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -55,7 +55,7 @@ class Adamax(Optimizer): The default value is 0.999. epsilon (float, optional): A small float value for numerical stability. The default value is 1e-08. - parameters (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. 
\ + parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index d0bb771c2ecae..cc46e8bdf9944 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -147,6 +147,9 @@ class AdamW(DecoupledWeightDecay, Adam): Args: learning_rate (float|Tensor, optional): The learning rate used to update ``Parameter``. It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. + parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates. It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.9. @@ -154,12 +157,8 @@ class AdamW(DecoupledWeightDecay, Adam): It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.999. epsilon (float, optional): A small float value for numerical stability. + weight_decay (float|Tensor): The weight decay coefficient, it can be float or Tensor. The default value is 0.0. The default value is 1e-08. - parameters (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - weight_decay (float|Tensor): The weight decay coefficient, it can be - float or Tensor. apply_decay_param_fun (function|None): If it is not None, only tensors that makes apply_decay_param_fun(Tensor)==True will be updated. It only works when we want to specify tensors. @@ -208,13 +207,13 @@ class AdamW(DecoupledWeightDecay, Adam): """ def __init__(self, - weight_decay, - apply_decay_param_fun=None, learning_rate=0.001, + parameters=None, beta1=0.9, beta2=0.999, epsilon=1e-8, - parameters=None, + weight_decay=0.0, + apply_decay_param_fun=None, grad_clip=None, name=None, lazy_mode=False): diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 2dabca5ad22b9..2ce454bdb0e7c 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -744,19 +744,19 @@ def backward(self, Examples: .. code-block:: python - import paddle - import numpy as np - paddle.disable_static() - value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.to_tensor(value) - linear = paddle.nn.Linear(13, 5, dtype="float32") - # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(learning_rate = 0.01, - parameters = linear.parameters()) - out = linear(a) - out.backward() - adam.step() - adam.clear_gradients() + import paddle + import numpy as np + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + # This can be any optimizer supported by dygraph. 
+ adam = paddle.optimizer.Adam(learning_rate = 0.01, + parameters = linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() """ act_no_grad_set = None if framework.in_dygraph_mode(): @@ -935,31 +935,32 @@ def minimize(self, ``fetch_list`` before run, see details in ``Executor``. Examples: - .. code-bloack:: python - import paddle - import paddle.fluid as fluid - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.data(name='x', shape=[None, 13], dtype='float32') - y = fluid.data(name='y', shape=[None, 1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - adam_optimizer = paddle.optimizer.AdamOptimizer(0.01) - adam_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + .. code-bloack:: python + + import paddle + import paddle.fluid as fluid + + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.data(name='x', shape=[None, 13], dtype='float32') + y = fluid.data(name='y', shape=[None, 1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + adam_optimizer = paddle.optimizer.AdamOptimizer(0.01) + adam_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) """ assert isinstance(loss, Variable), "The loss should be an Tensor." diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 9b72ef59a02ff..545a39c848050 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -79,7 +79,7 @@ class RMSProp(Optimizer): the gradient; if False, by the uncentered second moment. Setting this to True may help with training, but is slightly more expensive in terms of computation and memory. Defaults to False. - parameters (Iterable, optional): Iterable of ``Tensor`` names to update to minimize ``loss``. \ + parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. 
\ From b6fa771cc2e5874b094dcc83739c2ed5c6b91a74 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Thu, 20 Aug 2020 16:51:27 +0800 Subject: [PATCH 20/30] bug fix --- python/paddle/optimizer/rmsprop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 545a39c848050..77fcf4b473f33 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from optimizer import Optimizer +from .optimizer import Optimizer from ..fluid import core from ..fluid import framework from ..fluid.framework import Variable From 9cd1838818f2e9adebd3b50b2254304b8aac6349 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Thu, 20 Aug 2020 19:10:26 +0800 Subject: [PATCH 21/30] fix comments --- python/paddle/optimizer/optimizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 2ce454bdb0e7c..80b65b903aec3 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -935,8 +935,7 @@ def minimize(self, ``fetch_list`` before run, see details in ``Executor``. Examples: - - .. code-bloack:: python + .. code-block:: python import paddle import paddle.fluid as fluid From 95310f5450bd3f92873aa22bbfe6d2b3d7bb2664 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Thu, 20 Aug 2020 19:31:38 +0800 Subject: [PATCH 22/30] fix sample code --- python/paddle/optimizer/adamax.py | 2 +- python/paddle/optimizer/optimizer.py | 99 ++++++++++++++-------------- 2 files changed, 51 insertions(+), 50 deletions(-) diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index 82cfbaf4a0304..f0e05f98f4d15 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -74,7 +74,7 @@ class Adamax(Optimizer): The default value is None. **Notes**: - **Currently, AdamaxOptimizer doesn't support sparse parameter optimization.** + **Currently, Adamax doesn't support sparse parameter optimization.** Examples: .. code-block:: python diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 80b65b903aec3..46766418721ac 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -196,6 +196,7 @@ def set_state_dict(self, state_dict): Examples: .. code-block:: python + import paddle paddle.disable_static() emb = paddle.nn.Embedding([10, 10]) @@ -744,19 +745,19 @@ def backward(self, Examples: .. code-block:: python - import paddle - import numpy as np - paddle.disable_static() - value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.to_tensor(value) - linear = paddle.nn.Linear(13, 5, dtype="float32") - # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(learning_rate = 0.01, - parameters = linear.parameters()) - out = linear(a) - out.backward() - adam.step() - adam.clear_gradients() + import paddle + import numpy as np + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + # This can be any optimizer supported by dygraph. 
+ adam = paddle.optimizer.Adam(learning_rate = 0.01, + parameters = linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() """ act_no_grad_set = None if framework.in_dygraph_mode(): @@ -808,19 +809,19 @@ def apply_gradients(self, params_grads): Examples: .. code-block:: python - import paddle - import numpy as np + import paddle + import numpy as np - paddle.disable_static() - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") - linear = paddle.nn.Linear(10, 10) - inp = paddle.to_tensor(inp) - out = linear(inp) - loss = paddle.mean(out) - optimizer = paddle.optimizer.Adam(learning_rate=0.1, - parameters=linear.parameters()) - params_grads = optimizer.backward(loss) - optimizer.apply_gradients(params_grads) + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + optimizer = paddle.optimizer.Adam(learning_rate=0.1, + parameters=linear.parameters()) + params_grads = optimizer.backward(loss) + optimizer.apply_gradients(params_grads) """ @@ -936,30 +937,30 @@ def minimize(self, Examples: .. code-block:: python - - import paddle - import paddle.fluid as fluid - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.data(name='x', shape=[None, 13], dtype='float32') - y = fluid.data(name='y', shape=[None, 1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - adam_optimizer = paddle.optimizer.AdamOptimizer(0.01) - adam_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + import paddle + import paddle.fluid as fluid + + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.data(name='x', shape=[None, 13], dtype='float32') + y = fluid.data(name='y', shape=[None, 1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + adam_optimizer = paddle.optimizer.Adam(0.01) + adam_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) """ assert isinstance(loss, Variable), "The loss should be an Tensor." From ce3179552605d8c3bd79992438d591c209bea348 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Thu, 20 Aug 2020 20:37:30 +0800 Subject: [PATCH 23/30] add sample code for Optimizer --- python/paddle/optimizer/optimizer.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 46766418721ac..ff894805c0c7f 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -74,6 +74,27 @@ class Optimizer(object): Returns: Base class for optimizer. 
+ + Examples: + .. code-block:: python + + #Take the subclass adam as an example + #Optimizer + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=0.1, + parameters=linear.parameters()) + out.backward() + adam.step() + adam.clear_grad() + """ @imperative_base.no_grad() From 0780b9cabeb3f179a748860b7322ef28d1874b72 Mon Sep 17 00:00:00 2001 From: mapingshuo Date: Fri, 21 Aug 2020 10:32:20 +0800 Subject: [PATCH 24/30] add adamax ut, test=develop --- .../fluid/tests/unittests/test_adamax_api.py | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_adamax_api.py diff --git a/python/paddle/fluid/tests/unittests/test_adamax_api.py b/python/paddle/fluid/tests/unittests/test_adamax_api.py new file mode 100644 index 0000000000000..f6946dc80b5e5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adamax_api.py @@ -0,0 +1,67 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid + + +class TestAdamaxAPI(unittest.TestCase): + def test_adamax_api_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_variable(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + adam = paddle.optimizer.Adamax( + learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=0.01) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_adamax_api(self): + place = fluid.CPUPlace() + shape = [2, 3, 8, 8] + exe = fluid.Executor(place) + train_prog = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(train_prog, startup): + with fluid.unique_name.guard(): + data = fluid.data(name="data", shape=shape) + conv = fluid.layers.conv2d(data, 8, 3) + loss = paddle.mean(conv) + beta1 = 0.85 + beta2 = 0.95 + opt = paddle.optimizer.Adamax( + learning_rate=1e-5, + beta1=beta1, + beta2=beta2, + weight_decay=0.01, + epsilon=1e-8) + opt.minimize(loss) + + exe.run(startup) + data_np = np.random.random(shape).astype('float32') + rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss]) + assert rets[0] is not None + + +if __name__ == "__main__": + unittest.main() From 87a7f56f21deccc671f733afb18cc805512a0df9 Mon Sep 17 00:00:00 2001 From: mapingshuo Date: Fri, 21 Aug 2020 10:49:11 +0800 Subject: [PATCH 25/30] fix rmsprop ut, test=develop --- .../fluid/tests/unittests/test_rmsprop_op.py | 15 +++++++++++++++ python/paddle/optimizer/rmsprop.py | 13 +++++++------ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py 
b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index 5d50c7d93c9d5..0f225758ced3b 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -261,6 +261,21 @@ def test_rmsprop(self): for data in train_reader(): exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + def test_raise_error(self): + self.assertRaises(ValueError, paddle.optimizer.RMSProp, None) + self.assertRaises( + ValueError, paddle.optimizer.RMSProp, learning_rate=0.1, rho=None) + self.assertRaises( + ValueError, + paddle.optimizer.RMSProp, + learning_rate=0.1, + epsilon=None) + self.assertRaises( + ValueError, + paddle.optimizer.RMSProp, + learning_rate=0.1, + momentum=None) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 77fcf4b473f33..155a04b42db88 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -138,12 +138,6 @@ def __init__(self, weight_decay=None, grad_clip=None, name=None): - super(RMSProp, self).__init__( - learning_rate=learning_rate, - parameters=parameters, - weight_decay=weight_decay, - grad_clip=grad_clip, - name=name) if learning_rate is None: raise ValueError("learning_rate is not set.") if rho is None: @@ -153,6 +147,13 @@ def __init__(self, if momentum is None: raise ValueError("momentum is not set.") + super(RMSProp, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name) + self.type = "rmsprop" self._rho = rho self._epsilon = epsilon From 06f3c732ec13fa14ea011dcd60294391cb662968 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Fri, 21 Aug 2020 11:12:43 +0800 Subject: [PATCH 26/30] add ut for optimizer.py and adamw.py --- .../fluid/tests/unittests/test_adam_op.py | 62 +++++++++++++++---- .../fluid/tests/unittests/test_adamw_op.py | 1 + 2 files changed, 52 insertions(+), 11 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 365ad26344ba8..b0bfc5974090a 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -476,17 +476,57 @@ def test_adam_op(self): assert rets[0] is not None def test_adam_op_dygraph(self): - with fluid.dygraph.guard(): - value = np.arange(26).reshape(2, 13).astype("float32") - a = fluid.dygraph.to_variable(value) - linear = fluid.Linear(13, 5, dtype="float32") - - adam = paddle.optimizer.Adam( - learning_rate=0.01, parameters=linear.parameters()) - out = linear(a) - out.backward() - adam.step() - adam.clear_gradients() + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = fluid.dygraph.to_variable(value) + linear = fluid.Linear(13, 5, dtype="float32") + + adam = paddle.optimizer.Adam( + learning_rate=0.01, parameters=linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_adam_op_with_state_dict(self): + + import paddle + paddle.disable_static() + emb = paddle.nn.Embedding([10, 10]) + + adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) + state_dict = adam.state_dict() + + adam.set_state_dict(state_dict) + + #learning_rate is Decay + from paddle.fluid.regularizer import L2Decay + adam = paddle.optimizer.Adam( + learning_rate=0.01, + weight_decay=L2Decay(0.001), + parameters=emb.parameters()) + + state_dict = adam.state_dict() + 
adam.set_state_dict(state_dict) + + params = adam.get_opti_var_name_list() + assert (params is not None) + + def test_adam_op_with_set_lr(self): + import paddle + paddle.disable_static() + linear = paddle.nn.Linear(10, 10) + adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) + + lr = 0.01 + adam.set_lr(lr) + cur_lr = adam.current_step_lr() + assert (lr == cur_lr) + + lr_var = paddle.create_global_var(shape=[1], value=lr, dtype='float32') + adam.set_lr(lr_var) + cur_lr = adam.current_step_lr() + assert (np.float32(lr) == cur_lr) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index d38dc0087d8f6..ec25fc3eea9eb 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -27,6 +27,7 @@ def test_adamw_opi_dygraph(self): adam = paddle.optimizer.AdamW( learning_rate=0.01, parameters=linear.parameters(), + apply_decay_param_fun=lambda name: True, weight_decay=0.01) out = linear(a) out.backward() From b00b85f55928ef25391f9068ef6b7f8280afad57 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Fri, 21 Aug 2020 13:47:22 +0800 Subject: [PATCH 27/30] remove TestAdamOptimizerBetaVariable --- .../fluid/tests/unittests/test_adam_op.py | 42 ------------------- 1 file changed, 42 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index b0bfc5974090a..09eea7cc89997 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -402,48 +402,6 @@ def test_check_output(self): self.check_output() -class TestAdamOptimizerBetaVariable(unittest.TestCase): - def test_adam_optimizer(self): - def test_with_place(place, shape): - exe = fluid.Executor(place) - - train_prog = fluid.Program() - startup = fluid.Program() - with fluid.program_guard(train_prog, startup): - with fluid.unique_name.guard(): - data = fluid.data(name="data", shape=shape) - conv = fluid.layers.conv2d(data, 8, 3) - loss = fluid.layers.reduce_mean(conv) - - beta1 = fluid.layers.create_global_var( - shape=[1], - value=0.85, - dtype='float32', - persistable=True) - beta2 = fluid.layers.create_global_var( - shape=[1], - value=0.95, - dtype='float32', - persistable=True) - opt = fluid.optimizer.Adam( - learning_rate=1e-5, beta1=beta1, beta2=beta2) - opt.minimize(loss) - - exe.run(startup) - data_np = np.random.random(shape).astype('float32') - rets = exe.run(train_prog, - feed={"data": data_np}, - fetch_list=[loss]) - assert rets[0] is not None - - shape = [2, 3, 8, 8] - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - for place in places: - test_with_place(place, shape) - - class TestAdamOpV2(unittest.TestCase): def test_adam_op(self): place = fluid.CPUPlace() From 6cc0fc2372e8cf164db6664a480891d8ac065f50 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Fri, 21 Aug 2020 17:41:52 +0800 Subject: [PATCH 28/30] update api && add ut --- .../fluid/tests/unittests/test_adam_op.py | 19 +- .../fluid/tests/unittests/test_adamw_op.py | 14 +- ...st_fleet_graph_execution_meta_optimizer.py | 121 ++- .../unittests/test_imperative_optimizer_v2.py | 728 ++++++++++++++++++ python/paddle/optimizer/adam.py | 4 +- python/paddle/optimizer/adamw.py | 4 +- python/paddle/optimizer/optimizer.py | 26 +- python/paddle/optimizer/rmsprop.py | 3 +- 8 files changed, 894 insertions(+), 25 deletions(-) create mode 100644 
python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 09eea7cc89997..a6841feb79657 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -458,15 +458,24 @@ def test_adam_op_with_state_dict(self): adam.set_state_dict(state_dict) #learning_rate is Decay - from paddle.fluid.regularizer import L2Decay + learning_rate = fluid.dygraph.CosineDecay(0.1, 10000, 120) adam = paddle.optimizer.Adam( - learning_rate=0.01, - weight_decay=L2Decay(0.001), + learning_rate=learning_rate, + weight_decay=fluid.regularizer.L2Decay(0.001), parameters=emb.parameters()) state_dict = adam.state_dict() adam.set_state_dict(state_dict) + #leanrning_rate is Tensor + learning_rate = np.array([0.01]).astype("float32") + learning_rate = paddle.to_tensor(learning_rate) + adam = paddle.optimizer.Adam( + learning_rate=learning_rate, parameters=emb.parameters()) + + state_dict = adam.state_dict() + adam.set_state_dict(state_dict) + params = adam.get_opti_var_name_list() assert (params is not None) @@ -478,12 +487,12 @@ def test_adam_op_with_set_lr(self): lr = 0.01 adam.set_lr(lr) - cur_lr = adam.current_step_lr() + cur_lr = adam.get_lr() assert (lr == cur_lr) lr_var = paddle.create_global_var(shape=[1], value=lr, dtype='float32') adam.set_lr(lr_var) - cur_lr = adam.current_step_lr() + cur_lr = adam.get_lr() assert (np.float32(lr) == cur_lr) diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index ec25fc3eea9eb..0c2cd7ec9c7ed 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -19,7 +19,7 @@ class TestAdamWOp(unittest.TestCase): - def test_adamw_opi_dygraph(self): + def test_adamw_op_dygraph(self): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") a = paddle.to_variable(value) @@ -34,6 +34,18 @@ def test_adamw_opi_dygraph(self): adam.step() adam.clear_gradients() + def test_adamw_op_coverage(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_variable(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + adam = paddle.optimizer.AdamW( + learning_rate=0.0, + parameters=linear.parameters(), + apply_decay_param_fun=lambda name: True, + weight_decay=0.01) + assert (adam.__str__ is not None) + def test_adamw_op(self): place = fluid.CPUPlace() shape = [2, 3, 8, 8] diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py index 26e004164eb95..25039b8c9e015 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py @@ -111,7 +111,126 @@ def node_func(): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.nccl_comm_num = 2 strategy.sync_nccl_allreduce = True - optimizer = paddle.fluidoptimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace()) + exe.run(paddle.fluid.default_startup_program()) + + import numpy as np + + def gen_data(): + return { + 
"x": np.random.random(size=(128, 32)).astype('float32'), + "y": np.random.randint( + 2, size=(128, 1)).astype('int64') + } + + for i in range(10): + cost_val = exe.run(feed=gen_data(), fetch_list=[avg_cost.name]) + print("cost of step[{}] = {}".format(i, cost_val)) + + proc_a = launch_func(node_func, node_a) + proc_a.start() + proc_b = launch_func(node_func, node_b) + proc_b.start() + proc_a.join() + proc_b.join() + + def test_graph_execution_optimizer_not_apply_v2(self): + node_a = { + "PADDLE_TRAINER_ID": "0", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36003", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004", + "http_proxy": "", + "https_proxy": "" + } + + node_b = { + "PADDLE_TRAINER_ID": "1", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36004", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004", + "http_proxy": "", + "https_proxy": "" + } + + def node_func(): + import paddle.distributed.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + proc_a = launch_func(node_func, node_a) + proc_a.start() + proc_b = launch_func(node_func, node_b) + proc_b.start() + proc_a.join() + proc_b.join() + + def test_graph_execution_optimizer(self): + node_a = { + "PADDLE_TRAINER_ID": "0", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36001", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001,127.0.0.1:36002", + "http_proxy": "", + "https_proxy": "" + } + + node_b = { + "PADDLE_TRAINER_ID": "1", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36002", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001,127.0.0.1:36002", + "http_proxy": "", + "https_proxy": "" + } + + def node_func(): + import paddle.distributed.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.nccl_comm_num = 2 + strategy.sync_nccl_allreduce = True + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git 
a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py new file mode 100644 index 0000000000000..9f75c92b185ed --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py @@ -0,0 +1,728 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import contextlib +import unittest +import numpy as np +import six +import itertools + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.optimizer import SGDOptimizer, Adam, MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer +from paddle.fluid.optimizer import ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer +from paddle.fluid.dygraph import Linear +from paddle.fluid.dygraph.base import to_variable +from test_imperative_base import new_program_scope + +# Note(wangzhongpu) +# In dygraph, don't support ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer. 
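# The test classes below follow one pattern: optimizers that are supported in
# dygraph train a small MLP and are compared against the static-graph result
# (``_check_mlp``), while the unsupported wrappers only need to raise the error
# message checked by ``_check_exception``. A minimal sketch of the dygraph API
# being exercised, mirroring the Optimizer docstring example added earlier in
# this series (assumes the usual numpy/paddle imports above):
#
#     paddle.disable_static()
#     linear = paddle.nn.Linear(10, 10)
#     inp = paddle.to_tensor(
#         np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32"))
#     loss = paddle.mean(linear(inp))
#     adam = paddle.optimizer.Adam(
#         learning_rate=0.1, parameters=linear.parameters())
#     loss.backward()
#     adam.step()
#     adam.clear_grad()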
+ + +class MLP(fluid.Layer): + def __init__(self, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._fc1 = Linear(784, 10) + self._fc2 = Linear(10, 10) + + def forward(self, inputs): + y = self._fc1(inputs) + y = self._fc2(y) + return y + + +class TestImperativeOptimizerBase(unittest.TestCase): + def setUp(self): + self.batch_num = 20 + + def get_optimizer_dygraph(self, parameter_list): + raise NotImplementedError() + + def get_optimizer(self): + raise NotImplementedError() + + def reader_decorator(self, reader): + def _reader_imple(): + for item in reader(): + image = np.array(item[0]).reshape(1, 784) + label = np.array(item[1]).astype('int64').reshape(1) + yield image, label + + return _reader_imple + + def _check_exception(self, exception_message, place=None): + seed = 90 + batch_size = 128 + if place == None: + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + with fluid.dygraph.guard(place): + try: + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + mlp = MLP() + optimizer = self.get_optimizer_dygraph( + parameter_list=mlp.parameters()) + except Exception as e: + assert str(e) == exception_message + + def _check_mlp(self, place=None): + seed = 90 + batch_size = 128 + + if place == None: + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + + with fluid.dygraph.guard(place): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + mlp = MLP() + optimizer = self.get_optimizer_dygraph( + parameter_list=mlp.parameters()) + + batch_py_reader = fluid.io.PyReader(capacity=1) + batch_py_reader.decorate_sample_list_generator( + paddle.batch( + self.reader_decorator(paddle.dataset.mnist.train()), + batch_size=batch_size, + drop_last=True), + places=fluid.CPUPlace()) + + dy_param_init_value = {} + for batch_id, data in enumerate(batch_py_reader()): + if batch_id >= self.batch_num: + break + + img = data[0] + label = data[1] + label.stop_gradient = True + + img = fluid.layers.reshape(img, shape=[batch_size, -1]) + cost = mlp(img) + avg_loss = fluid.layers.reduce_mean(cost) + dy_out = avg_loss.numpy() + + if batch_id == 0: + for param in mlp.parameters(): + dy_param_init_value[param.name] = param.numpy() + + avg_loss.backward() + optimizer.minimize(avg_loss) + mlp.clear_gradients() + dy_param_value = {} + for param in mlp.parameters(): + dy_param_value[param.name] = param.numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + if place == None: + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + + exe = fluid.Executor(place) + + mlp = MLP() + optimizer = self.get_optimizer() + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128, drop_last=True) + + img = fluid.layers.data( + name='pixel', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + img = fluid.layers.reshape(img, shape=[batch_size, 784]) + cost = mlp(img) + avg_loss = fluid.layers.reduce_mean(cost) + optimizer.minimize(avg_loss) + + # initialize params and fetch them + static_param_init_value = {} + static_param_name_list = [] + for param in mlp.parameters(): + static_param_name_list.append(param.name) + + out = exe.run(fluid.default_startup_program(), + fetch_list=static_param_name_list) + + for i in range(len(static_param_name_list)): + 
static_param_init_value[static_param_name_list[i]] = out[i] + + for batch_id, data in enumerate(train_reader()): + if batch_id >= self.batch_num: + break + + static_x_data = np.array( + [x[0].reshape(1, 28, 28) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + [128, 1]) + + fetch_list = [avg_loss.name] + fetch_list.extend(static_param_name_list) + out = exe.run(fluid.default_main_program(), + feed={"pixel": static_x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_out = out[0] + for i in range(1, len(out)): + static_param_value[static_param_name_list[i - 1]] = out[i] + + for key, value in six.iteritems(static_param_init_value): + self.assertTrue(np.allclose(value, dy_param_init_value[key])) + + self.assertTrue(np.allclose(static_out, dy_out)) + + for key, value in six.iteritems(static_param_value): + self.assertTrue(np.allclose(value, dy_param_value[key])) + + +class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + bd = [3, 6, 9] + optimizer = SGDOptimizer( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, + values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + bd = [3, 6, 9] + optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)])) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.natural_exp_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.exponential_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = Adam( + learning_rate=fluid.layers.inverse_time_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_adam(self): + self._check_mlp() + + +class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.polynomial_decay( + learning_rate=0.1, decay_steps=5, cycle=self.cycle), 
+ parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay( + learning_rate=0.1, decay_steps=5, cycle=self.cycle)) + return optimizer + + def test_sgd_cycle(self): + self.cycle = True + self._check_mlp() + + def test_sgd(self): + self.cycle = False + self._check_mlp() + + +class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.cosine_decay( + learning_rate=0.1, step_each_epoch=10000, epochs=120), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay( + learning_rate=0.1, step_each_epoch=10000, epochs=120)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.noam_decay( + d_model=512, warmup_steps=8000), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay( + d_model=512, warmup_steps=8000)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestOptimizerLearningRate(unittest.TestCase): + def test_constant_lr(self): + with fluid.dygraph.guard(): + a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + linear = fluid.dygraph.nn.Linear(10, 10) + + a = fluid.dygraph.to_variable(a) + + b = linear(a) + + loss = fluid.layers.reduce_mean(b) + + adam = paddle.optimizer.Adam(0.001, parameters=linear.parameters()) + + self.assertTrue( + np.allclose( + adam.get_lr(), 0.001, rtol=1e-06, atol=0.0)) + + for i in range(10): + adam.minimize(loss) + lr = adam.get_lr() + + self.assertTrue(np.allclose(lr, 0.001, rtol=1e-06, atol=0.0)) + + def test_lr_decay(self): + with fluid.dygraph.guard(): + a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + linear = fluid.dygraph.nn.Linear(10, 10) + + a = fluid.dygraph.to_variable(a) + + b = linear(a) + + loss = fluid.layers.reduce_mean(b) + + bd = [2, 4, 6, 8] + value = [0.2, 0.4, 0.6, 0.8, 1.0] + + adam = paddle.optimizer.Adam( + fluid.dygraph.PiecewiseDecay(bd, value, 0), + parameters=linear.parameters()) + + self.assertTrue( + np.allclose( + adam.get_lr(), 0.2, rtol=1e-06, atol=0.0)) + + ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0] + for i in range(12): + adam.minimize(loss) + lr = adam.get_lr() + + self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0)) + + def test_lr_decay_natural_exp(self): + with fluid.dygraph.guard(): + a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + linear = fluid.dygraph.nn.Linear(10, 10) + + a = fluid.dygraph.to_variable(a) + + b = linear(a) + + loss = fluid.layers.reduce_mean(b) + base_lr = 1.0 + + adam = paddle.optimizer.Adam( + fluid.dygraph.NaturalExpDecay( + learning_rate=base_lr, + decay_steps=3, + decay_rate=0.5, + staircase=True), + parameters=linear.parameters()) + + self.assertTrue( + np.allclose( + adam.get_lr(), 1.0, rtol=1e-06, atol=0.0)) + + ret = [1.0, 1.0, 1.0, np.exp(-0.5), np.exp(-0.5)] + for i in range(5): + adam.minimize(loss) + lr = adam.get_lr() + + self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0)) + + def test_set_lr(self): + with fluid.dygraph.guard(): + a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + linear = 
fluid.dygraph.nn.Linear(10, 10) + + a = fluid.dygraph.to_variable(a) + + b = linear(a) + + loss = fluid.layers.reduce_mean(b) + + adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) + + lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] + for i in range(5): + adam.set_lr(lr_list[i]) + adam.minimize(loss) + lr = adam.get_lr() + self.assertTrue( + np.allclose( + lr, lr_list[i], rtol=1e-06, atol=0.0)) + + lr_var = fluid.layers.create_global_var( + shape=[1], value=0.7, dtype='float32') + adam.set_lr(lr_var) + adam.minimize(loss) + lr = adam.get_lr() + self.assertTrue(np.allclose(lr, 0.7, rtol=1e-06, atol=0.0)) + + with self.assertRaises(RuntimeError): + adam = paddle.optimizer.Adam( + fluid.dygraph.NaturalExpDecay( + learning_rate=0.1, + decay_steps=3, + decay_rate=0.5, + staircase=True), + parameters=linear.parameters()) + adam.set_lr(0.01) + + +class TestImperativeMomentumOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = MomentumOptimizer( + learning_rate=0.001, momentum=0.9, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) + return optimizer + + def test_momentum(self): + self._check_mlp() + + +class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = LarsMomentumOptimizer( + learning_rate=0.001, momentum=0.9, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9) + return optimizer + + def test_larsmomentum(self): + self._check_mlp() + + +class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = AdagradOptimizer( + learning_rate=0.2, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = AdagradOptimizer(learning_rate=0.2) + return optimizer + + def test_adagrad(self): + self._check_mlp() + + +class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = AdamaxOptimizer( + learning_rate=0.2, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = AdamaxOptimizer(learning_rate=0.2) + return optimizer + + def test_adamax(self): + self._check_mlp() + + +class TestImperativeDpsgdOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = DpsgdOptimizer( + learning_rate=0.01, + clip=10.0, + batch_size=16.0, + sigma=1.0, + parameter_list=parameter_list) + optimizer._seed = 100 + return optimizer + + def get_optimizer(self): + optimizer = DpsgdOptimizer( + learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0) + optimizer._seed = 100 + return optimizer + + def test_dpsgd(self): + self._check_mlp(place=fluid.CPUPlace()) + + +class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = DecayedAdagradOptimizer( + learning_rate=0.2, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = DecayedAdagradOptimizer(learning_rate=0.2) + return optimizer + + def test_decayadagrad(self): + self._check_mlp() + + +class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = AdadeltaOptimizer( + learning_rate=0.0003, + epsilon=1.0e-6, + 
rho=0.95, + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = AdadeltaOptimizer( + learning_rate=0.0003, epsilon=1.0e-6, rho=0.95) + return optimizer + + def test_adadelta(self): + self._check_mlp() + + +class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = RMSPropOptimizer( + learning_rate=0.1, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = RMSPropOptimizer(learning_rate=0.1) + return optimizer + + def test_rmsprop(self): + self._check_mlp() + + +class TestImperativeFtrlOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = FtrlOptimizer( + learning_rate=0.1, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = FtrlOptimizer(learning_rate=0.1) + return optimizer + + def test_ftrl(self): + self._check_mlp() + + +def exclude_fn(param): + return param.name.endswith('.b_0') + + +class TestImperativeLambOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = LambOptimizer( + learning_rate=0.002, + exclude_from_weight_decay_fn=exclude_fn, + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = LambOptimizer( + learning_rate=0.002, exclude_from_weight_decay_fn=exclude_fn) + return optimizer + + def test_lamb(self): + self._check_mlp() + + +class TestImperativeModelAverage(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = ModelAverage( + 0.15, min_average_window=10000, max_average_window=12500) + return optimizer + + def test_modelaverage(self): + exception_message = "In dygraph, don't support ModelAverage." + self._check_exception(exception_message) + + +class TestImperativeDGCMomentumOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = DGCMomentumOptimizer( + learning_rate=0.0001, + momentum=0.9, + rampup_step=1000, + rampup_begin_step=1252, + sparsity=[0.999, 0.999]) + return optimizer + + def test_dgcmomentum(self): + exception_message = "In dygraph, don't support DGCMomentumOptimizer." + self._check_exception(exception_message) + + +class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = ExponentialMovingAverage(0.999) + return optimizer + + def test_exponentialmoving(self): + exception_message = "In dygraph, don't support ExponentialMovingAverage." + self._check_exception(exception_message) + + +class TestImperativePipelineOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD(learning_rate=0.5, + parameter_list=parameter_list) + optimizer = PipelineOptimizer(optimizer) + return optimizer + + def test_pipline(self): + exception_message = "In dygraph, don't support PipelineOptimizer." + self._check_exception(exception_message) + + +class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD(learning_rate=0.5, + parameter_list=parameter_list) + optimizer = LookaheadOptimizer(optimizer, alpha=0.5, k=5) + return optimizer + + def test_lookahead(self): + exception_message = "In dygraph, don't support LookaheadOptimizer." 
+ self._check_exception(exception_message) + + +class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD(learning_rate=0.5, + parameter_list=parameter_list) + optimizer = RecomputeOptimizer(optimizer) + return optimizer + + def test_recompute(self): + exception_message = "In dygraph, don't support RecomputeOptimizer." + self._check_exception(exception_message) + + +class TestImperativeOptimizerList(unittest.TestCase): + def test_parameter_list(self): + with fluid.dygraph.guard(): + linear_1 = Linear(10, 10) + linear_2 = Linear(10, 10) + + sgd = SGDOptimizer( + 1.0, + parameter_list=itertools.chain(linear_1.parameters(), + linear_2.parameters())) + + in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + in_data = fluid.dygraph.to_variable(in_np) + + y = linear_1(in_data) + y = linear_2(y) + loss = fluid.layers.reduce_mean(y) + loss.backward() + sgd.minimize(loss) + + self.assertTrue( + len(sgd._parameter_list) == + len(linear_1.parameters() + linear_2.parameters())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index b829bcd6047be..4ba5ecddb974e 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -45,8 +45,8 @@ class Adam(Optimizer): Related paper: `Adam: A Method for Stochastic Optimization `_ Args: - learning_rate (float|Tensor, optional): The learning rate used to update ``Parameter``. - It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. + learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``. + It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001. beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates. It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.9. diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index cc46e8bdf9944..7d47aa3f20291 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -145,8 +145,8 @@ class AdamW(DecoupledWeightDecay, Adam): Args: - learning_rate (float|Tensor, optional): The learning rate used to update ``Parameter``. - It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. + learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``. + It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001. parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index ff894805c0c7f..4b8882f5fc28f 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -52,8 +52,8 @@ class Optimizer(object): but need to use one of it's implementation. Args: - learning_rate (float|Tensor): The learning rate used to update ``Parameter``. - It can be a float value or a ``Tensor`` with a float type. + learning_rate (float|Tensor|LearningRateDecay): The learning rate used to update ``Parameter``. 
+ It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. @@ -161,7 +161,7 @@ def __init__(self, self._opti_name_list = [] self._accumulators_holder = {} self._param_device_map = dict() - self.clear_grad = self.clear_gradients + self.clear_gradients = self.clear_grad @framework.dygraph_only def state_dict(self): @@ -366,7 +366,7 @@ def set_lr(self, value): lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] for i in range(5): adam.set_lr(lr_list[i]) - lr = adam.current_step_lr() + lr = adam.get_lr() print("current lr is {}".format(lr)) # Print: # current lr is 0.2 @@ -380,7 +380,7 @@ def set_lr(self, value): lr_var = paddle.create_global_var( shape=[1], value=0.7, dtype='float32') adam.set_lr(lr_var) - lr = adam.current_step_lr() + lr = adam.get_lr() print("current lr is {}".format(lr)) # Print: # current lr is 0.7 @@ -416,7 +416,7 @@ def set_lr(self, value): self._learning_rate_map[framework.default_main_program()] = value @framework.dygraph_only - def current_step_lr(self): + def get_lr(self): """ :api_attr: imperative @@ -435,7 +435,7 @@ def current_step_lr(self): paddle.disable_static() emb = paddle.nn.Embedding([10, 10]) adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters()) - lr = adam.current_step_lr() + lr = adam.get_lr() print(lr) # 0.001 # example2: PiecewiseDecay is used, return the step learning rate @@ -452,13 +452,13 @@ def current_step_lr(self): parameters=linear.parameters()) # first step: learning rate is 0.2 - np.allclose(adam.current_step_lr(), 0.2, rtol=1e-06, atol=0.0) # True + np.allclose(adam.get_lr(), 0.2, rtol=1e-06, atol=0.0) # True # learning rate for different steps ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0] for i in range(12): adam.step() - lr = adam.current_step_lr() + lr = adam.get_lr() np.allclose(lr, ret[i], rtol=1e-06, atol=0.0) # True """ @@ -778,7 +778,7 @@ def backward(self, out = linear(a) out.backward() adam.step() - adam.clear_gradients() + adam.clear_grad() """ act_no_grad_set = None if framework.in_dygraph_mode(): @@ -899,7 +899,7 @@ def _get_no_grad_set(self, loss, no_grad_set=None): return no_grad_set @framework.dygraph_only - def clear_gradients(self): + def clear_grad(self): """ Clear the gradients of all optimized parameters for model. @@ -921,7 +921,7 @@ def clear_gradients(self): out = linear(a) out.backward() adam.step() - adam.clear_gradients() + adam.clear_grad() """ for p in self._parameter_list: @@ -1021,7 +1021,7 @@ def step(self): out = linear(a) out.backward() adam.step() - adam.clear_gradients() + adam.clear_grad() """ parameter_list = self._parameter_list self._dtype = None diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 155a04b42db88..9ed82646e6ef0 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -69,7 +69,8 @@ class RMSProp(Optimizer): Parameters: - learning_rate(float): Global learning rate. + learning_rate (float|Tensor|LearningRateDecay): The learning rate used to update ``Parameter``. + It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. rho(float): rho is :math: `\\rho` in equation, default is 0.95. epsilon(float): :math: `\\epsilon` in equation is smoothing term to avoid division by zero, default is 1e-6. 
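The hunks above rename several imperative-mode methods on the new paddle.optimizer.Optimizer (current_step_lr -> get_lr, clear_gradients -> clear_grad) and update the docstring examples accordingly. For orientation only, a minimal dygraph sketch of the renamed surface; it assumes the 2.0-beta API this series targets, and the throwaway Linear model, input values, and learning rates are placeholders rather than part of the patch:

    import numpy as np
    import paddle
    import paddle.fluid as fluid

    paddle.disable_static()
    linear = paddle.nn.Linear(10, 10)
    adam = paddle.optimizer.Adam(learning_rate=0.001,
                                 parameters=linear.parameters())

    inp = fluid.dygraph.to_variable(
        np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32"))
    loss = fluid.layers.reduce_mean(linear(inp))
    loss.backward()

    adam.step()            # apply one optimization step
    adam.clear_grad()      # renamed from clear_gradients()
    print(adam.get_lr())   # renamed from current_step_lr(); prints 0.001
    adam.set_lr(0.01)      # a float learning rate can still be overridden

Note that the old name is kept as an alias in this patch (self.clear_gradients = self.clear_grad), so existing user code that calls clear_gradients() keeps working.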
From 5d4242076575a9d128a0aa23d09dcc5abe8af217 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Fri, 21 Aug 2020 19:44:22 +0800 Subject: [PATCH 29/30] update doc && fix ut --- .../fluid/tests/unittests/test_adam_op.py | 17 +++++++++-------- python/paddle/optimizer/adam.py | 4 ++-- python/paddle/optimizer/adamax.py | 4 ++-- python/paddle/optimizer/adamw.py | 4 ++-- python/paddle/optimizer/optimizer.py | 4 ++-- python/paddle/optimizer/rmsprop.py | 4 ++-- 6 files changed, 19 insertions(+), 18 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index a6841feb79657..7025d5b9e2af0 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -468,16 +468,17 @@ def test_adam_op_with_state_dict(self): adam.set_state_dict(state_dict) #leanrning_rate is Tensor - learning_rate = np.array([0.01]).astype("float32") - learning_rate = paddle.to_tensor(learning_rate) - adam = paddle.optimizer.Adam( - learning_rate=learning_rate, parameters=emb.parameters()) + with self.assertRaises(TypeError): + learning_rate = np.array([0.01]).astype("float32") + learning_rate = paddle.to_tensor(learning_rate) + adam = paddle.optimizer.Adam( + learning_rate=learning_rate, parameters=emb.parameters()) - state_dict = adam.state_dict() - adam.set_state_dict(state_dict) + state_dict = adam.state_dict() + adam.set_state_dict(state_dict) - params = adam.get_opti_var_name_list() - assert (params is not None) + params = adam.get_opti_var_name_list() + assert (params is not None) def test_adam_op_with_set_lr(self): import paddle diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 4ba5ecddb974e..0da8053fe8a34 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -45,8 +45,8 @@ class Adam(Optimizer): Related paper: `Adam: A Method for Stochastic Optimization `_ Args: - learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``. - It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001. + learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``. + It can be a float value or a LearningRateDecay. The default value is 0.001. beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates. It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.9. diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index f0e05f98f4d15..73a78b17cbba5 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -47,8 +47,8 @@ class Adamax(Optimizer): it is added here for numerical stability to prevent the division by 0 error. Args: - learning_rate (float|Tensor, optional): The learning rate used to update ``Parameter``. - It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. + learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``. + It can be a float value or a LearningRateDecay. The default value is 0.001. beta1 (float, optional): The exponential decay rate for the 1st moment estimates. The default value is 0.9. beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. 
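Patch 29 above narrows learning_rate for the v2 optimizers to float|LearningRateDecay, and the updated test_adam_op.py now wraps the Tensor case in assertRaises(TypeError). A hedged illustration of that behaviour; the Linear model and values are placeholders, and the exact point in the constructor that raises is inferred from the updated test rather than stated in the diff:

    import numpy as np
    import paddle

    paddle.disable_static()
    linear = paddle.nn.Linear(10, 10)

    # A Tensor learning rate is no longer accepted by the v2 optimizers.
    lr_tensor = paddle.to_tensor(np.array([0.01]).astype("float32"))
    try:
        adam = paddle.optimizer.Adam(learning_rate=lr_tensor,
                                     parameters=linear.parameters())
    except TypeError as exc:
        print("rejected as expected:", exc)

    # A plain float (or a LearningRateDecay schedule) still works.
    adam = paddle.optimizer.Adam(learning_rate=0.01,
                                 parameters=linear.parameters())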
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 7d47aa3f20291..f498fcbffa24e 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -145,8 +145,8 @@ class AdamW(DecoupledWeightDecay, Adam): Args: - learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``. - It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001. + learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``. + It can be a float value or a LearningRateDecay. The default value is 0.001. parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 4b8882f5fc28f..b3773ddeb42fc 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -52,8 +52,8 @@ class Optimizer(object): but need to use one of it's implementation. Args: - learning_rate (float|Tensor|LearningRateDecay): The learning rate used to update ``Parameter``. - It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. + learning_rate (float|LearningRateDecay): The learning rate used to update ``Parameter``. + It can be a float value or a LearningRateDecay. parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 9ed82646e6ef0..0bc4c9bfd53dc 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -69,8 +69,8 @@ class RMSProp(Optimizer): Parameters: - learning_rate (float|Tensor|LearningRateDecay): The learning rate used to update ``Parameter``. - It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. + learning_rate (float|LearningRateDecay): The learning rate used to update ``Parameter``. + It can be a float value or a LearningRateDecay. rho(float): rho is :math: `\\rho` in equation, default is 0.95. epsilon(float): :math: `\\epsilon` in equation is smoothing term to avoid division by zero, default is 1e-6. 
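Since float and LearningRateDecay are now the two documented learning_rate types, here is a small dygraph sketch of the schedule path. It mirrors TestOptimizerLearningRate.test_lr_decay_natural_exp from the test diff earlier in the series and, like that test, only exercises the learning-rate bookkeeping (no gradients are computed); the schedule settings are taken from the test, everything else is a placeholder:

    import numpy as np
    import paddle
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        linear = fluid.dygraph.nn.Linear(10, 10)
        a = fluid.dygraph.to_variable(
            np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32"))
        loss = fluid.layers.reduce_mean(linear(a))

        # A LearningRateDecay schedule is passed directly as learning_rate.
        adam = paddle.optimizer.Adam(
            fluid.dygraph.NaturalExpDecay(
                learning_rate=1.0,
                decay_steps=3,
                decay_rate=0.5,
                staircase=True),
            parameters=linear.parameters())

        for _ in range(5):
            adam.minimize(loss)
            # 1.0 for the first three steps, then exp(-0.5)
            print(adam.get_lr())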
From 909478274369233f04301b14e1ff8636292f4463 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Sun, 23 Aug 2020 13:34:52 +0800 Subject: [PATCH 30/30] add ut --- .../fluid/tests/unittests/test_adam_op.py | 26 +- .../fluid/tests/unittests/test_adamw_op.py | 2 +- .../unittests/test_imperative_save_load_v2.py | 917 ++++++++++++++++++ python/paddle/optimizer/optimizer.py | 42 - 4 files changed, 937 insertions(+), 50 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 7025d5b9e2af0..990499858ca52 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -454,7 +454,6 @@ def test_adam_op_with_state_dict(self): adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) state_dict = adam.state_dict() - adam.set_state_dict(state_dict) #learning_rate is Decay @@ -463,7 +462,7 @@ def test_adam_op_with_state_dict(self): learning_rate=learning_rate, weight_decay=fluid.regularizer.L2Decay(0.001), parameters=emb.parameters()) - + lr = adam.get_lr() state_dict = adam.state_dict() adam.set_state_dict(state_dict) @@ -474,14 +473,23 @@ def test_adam_op_with_state_dict(self): adam = paddle.optimizer.Adam( learning_rate=learning_rate, parameters=emb.parameters()) - state_dict = adam.state_dict() - adam.set_state_dict(state_dict) + params = adam.get_opti_var_name_list() + assert (params is not None) - params = adam.get_opti_var_name_list() - assert (params is not None) + def test_adam_with_grad_clip(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = fluid.dygraph.to_variable(value) + linear = fluid.Linear(13, 5, dtype="float32") + clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + adam = paddle.optimizer.Adam( + 0.1, parameters=linear.parameters(), grad_clip=clip) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() def test_adam_op_with_set_lr(self): - import paddle paddle.disable_static() linear = paddle.nn.Linear(10, 10) adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) @@ -496,6 +504,10 @@ def test_adam_op_with_set_lr(self): cur_lr = adam.get_lr() assert (np.float32(lr) == cur_lr) + with self.assertRaises(TypeError): + lr = int(1) + adam.set_lr(lr) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index 0c2cd7ec9c7ed..ddb70d6e6400c 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -44,7 +44,7 @@ def test_adamw_op_coverage(self): parameters=linear.parameters(), apply_decay_param_fun=lambda name: True, weight_decay=0.01) - assert (adam.__str__ is not None) + assert (adam.__str__() is not None) def test_adamw_op(self): place = fluid.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py new file mode 100644 index 0000000000000..4ab35a21aff43 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -0,0 +1,917 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import unittest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.dygraph.nn import Embedding, Linear +import paddle.fluid.framework as framework +from paddle.optimizer import Adam +from paddle.fluid.dygraph.base import to_variable +from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay +from test_imperative_base import new_program_scope +import numpy as np +import six +import paddle + + +class SimpleLSTMRNN(fluid.Layer): + def __init__(self, + hidden_size, + num_steps, + num_layers=2, + init_scale=0.1, + dropout=None): + super(SimpleLSTMRNN, self).__init__() + self._hidden_size = hidden_size + self._num_layers = num_layers + self._init_scale = init_scale + self._dropout = dropout + self._input = None + self._num_steps = num_steps + self.cell_array = [] + self.hidden_array = [] + self.weight_1_arr = [] + self.weight_2_arr = [] + self.bias_arr = [] + self.mask_array = [] + + for i in range(self._num_layers): + weight_1 = self.create_parameter( + attr=fluid.ParamAttr( + initializer=fluid.initializer.UniformInitializer( + low=-self._init_scale, high=self._init_scale)), + shape=[self._hidden_size * 2, self._hidden_size * 4], + dtype="float32", + default_initializer=fluid.initializer.UniformInitializer( + low=-self._init_scale, high=self._init_scale)) + self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1)) + bias_1 = self.create_parameter( + attr=fluid.ParamAttr( + initializer=fluid.initializer.UniformInitializer( + low=-self._init_scale, high=self._init_scale)), + shape=[self._hidden_size * 4], + dtype="float32", + default_initializer=fluid.initializer.Constant(0.0)) + self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1)) + + def forward(self, input_embedding, init_hidden=None, init_cell=None): + self.cell_array = [] + self.hidden_array = [] + + for i in range(self._num_layers): + pre_hidden = fluid.layers.slice( + init_hidden, axes=[0], starts=[i], ends=[i + 1]) + pre_cell = fluid.layers.slice( + init_cell, axes=[0], starts=[i], ends=[i + 1]) + pre_hidden = fluid.layers.reshape( + pre_hidden, shape=[-1, self._hidden_size]) + pre_cell = fluid.layers.reshape( + pre_cell, shape=[-1, self._hidden_size]) + self.hidden_array.append(pre_hidden) + self.cell_array.append(pre_cell) + + res = [] + for index in range(self._num_steps): + self._input = fluid.layers.slice( + input_embedding, axes=[1], starts=[index], ends=[index + 1]) + self._input = fluid.layers.reshape( + self._input, shape=[-1, self._hidden_size]) + for k in range(self._num_layers): + pre_hidden = self.hidden_array[k] + pre_cell = self.cell_array[k] + weight_1 = self.weight_1_arr[k] + bias = self.bias_arr[k] + + nn = fluid.layers.concat([self._input, pre_hidden], 1) + gate_input = fluid.layers.matmul(x=nn, y=weight_1) + + gate_input = fluid.layers.elementwise_add(gate_input, bias) + i, j, f, o = fluid.layers.split( + gate_input, num_or_sections=4, dim=-1) + c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid( + i) * fluid.layers.tanh(j) + m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o) + 
self.hidden_array[k] = m + self.cell_array[k] = c + self._input = m + + if self._dropout is not None and self._dropout > 0.0: + self._input = fluid.layers.dropout( + self._input, + dropout_prob=self._dropout, + dropout_implementation='upscale_in_train') + res.append( + fluid.layers.reshape( + self._input, shape=[1, -1, self._hidden_size])) + real_res = fluid.layers.concat(res, 0) + real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2]) + last_hidden = fluid.layers.concat(self.hidden_array, 1) + last_hidden = fluid.layers.reshape( + last_hidden, shape=[-1, self._num_layers, self._hidden_size]) + last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) + last_cell = fluid.layers.concat(self.cell_array, 1) + last_cell = fluid.layers.reshape( + last_cell, shape=[-1, self._num_layers, self._hidden_size]) + last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2]) + return real_res, last_hidden, last_cell + + +class PtbModel(fluid.Layer): + def __init__(self, + hidden_size, + vocab_size, + num_layers=2, + num_steps=20, + init_scale=0.1, + dropout=None): + super(PtbModel, self).__init__() + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.init_scale = init_scale + self.num_layers = num_layers + self.num_steps = num_steps + self.dropout = dropout + self.simple_lstm_rnn = SimpleLSTMRNN( + hidden_size, + num_steps, + num_layers=num_layers, + init_scale=init_scale, + dropout=dropout) + self.embedding = Embedding( + size=[vocab_size, hidden_size], + dtype='float32', + is_sparse=False, + param_attr=fluid.ParamAttr( + name='embedding_para', + initializer=fluid.initializer.UniformInitializer( + low=-init_scale, high=init_scale))) + + self.softmax_weight = self.create_parameter( + attr=fluid.ParamAttr(), + shape=[self.hidden_size, self.vocab_size], + dtype="float32", + default_initializer=fluid.initializer.UniformInitializer( + low=-self.init_scale, high=self.init_scale)) + self.softmax_bias = self.create_parameter( + attr=fluid.ParamAttr(), + shape=[self.vocab_size], + dtype="float32", + default_initializer=fluid.initializer.UniformInitializer( + low=-self.init_scale, high=self.init_scale)) + + def forward(self, input, label, init_hidden, init_cell): + init_h = fluid.layers.reshape( + init_hidden, shape=[self.num_layers, -1, self.hidden_size]) + + init_c = fluid.layers.reshape( + init_cell, shape=[self.num_layers, -1, self.hidden_size]) + + x_emb = self.embedding(input) + x_emb = fluid.layers.reshape( + x_emb, shape=[-1, self.num_steps, self.hidden_size]) + if self.dropout is not None and self.dropout > 0.0: + x_emb = fluid.layers.dropout( + x_emb, + dropout_prob=self.drop_out, + dropout_implementation='upscale_in_train') + rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h, + init_c) + rnn_out = fluid.layers.reshape( + rnn_out, shape=[-1, self.num_steps, self.hidden_size]) + + projection = fluid.layers.matmul(rnn_out, self.softmax_weight) + projection = fluid.layers.elementwise_add(projection, self.softmax_bias) + projection = fluid.layers.reshape( + projection, shape=[-1, self.vocab_size]) + loss = fluid.layers.softmax_with_cross_entropy( + logits=projection, label=label, soft_label=False) + loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps]) + loss = fluid.layers.reduce_mean(loss, dim=[0]) + loss = fluid.layers.reduce_sum(loss) + + return loss, last_hidden, last_cell + + +class TestDygraphPtbRnn(unittest.TestCase): + def setUp(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 
0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [1.0] + # this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + new_lr = 1.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + adam = Adam( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr_arr), + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + for i in range(batch_num): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + if i == 0: + for param in ptb_model.parameters(): + dy_param_init[param.name] = param.numpy() + dy_loss.backward() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + if i == batch_num - 1: + for param in ptb_model.parameters(): + dy_param_updated[param.name] = param.numpy() + + # check optimizer + self.opti_dict = adam.state_dict() + self.base_opti = {} + for k, v in self.opti_dict.items(): + if isinstance(v, core.VarBase): + self.base_opti[v.name] = v.numpy() + self.assertTrue(np.sum(np.abs(v.numpy())) != 0) + else: + self.base_opti[k] = v + + fluid.save_dygraph(self.opti_dict, "./test_dy") + + self.state_dict = ptb_model.state_dict() + + self.model_base = {} + for k, v in self.state_dict.items(): + np_t = v.numpy() + self.model_base[k] = np_t + + paddle.save(self.state_dict, "./test_dy") + + def testLoadAndSetVarBase(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [1.0] + # this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + new_lr = 1.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + adam = Adam( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr_arr), + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + for i in range(batch_num): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), 
dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + if i == 0: + for param in ptb_model.parameters(): + dy_param_init[param.name] = param.numpy() + dy_loss.backward() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + if i == batch_num - 1: + for param in ptb_model.parameters(): + dy_param_updated[param.name] = param.numpy() + + # check optimizer + opti_dict = adam.state_dict() + # set to zero + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + np_t = v.numpy() + var = v.value().get_tensor() + var.set(np.zeros_like(np_t), place) + + self.assertTrue(np.sum(np.abs(v.numpy())) == 0) + + if isinstance(adam._learning_rate, LearningRateDecay): + adam._learning_rate.step_num = 0 + + para_state_dict, opti_state_dict = paddle.load("./test_dy") + adam.set_state_dict(opti_state_dict) + + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name])) + else: + self.assertEqual(v, self.base_opti[k]) + + # check parameter + state_dict = ptb_model.state_dict() + for k, v in state_dict.items(): + np_t = v.numpy() + var = v.value().get_tensor() + + var.set(np.zeros_like(np_t), place) + + ptb_model.set_dict(para_state_dict) + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + + self.assertTrue(np.array_equal(new_t, base_t)) + + def testSetVariable(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [1.0] + # this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + new_lr = 1.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + adam = Adam( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr_arr), + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + for i in range(batch_num): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + if i == 0: + for param in ptb_model.parameters(): + dy_param_init[param.name] = param.numpy() + dy_loss.backward() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + if i == batch_num - 1: + for param in ptb_model.parameters(): + dy_param_updated[param.name] = param.numpy() + + # check optimizer + opti_dict = adam.state_dict() + # set to zero + for k, v in opti_dict.items(): + if 
isinstance(v, core.VarBase): + np_t = v.numpy() + var = v.value().get_tensor() + var.set(np.zeros_like(np_t), place) + + self.assertTrue(np.sum(np.abs(v.numpy())) == 0) + + if isinstance(adam._learning_rate, LearningRateDecay): + adam._learning_rate.step_num = 0 + + adam.set_state_dict(self.opti_dict) + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name])) + else: + self.assertEqual(v, self.base_opti[k]) + + # check parameter + state_dict = ptb_model.state_dict() + for k, v in state_dict.items(): + np_t = v.numpy() + var = v.value().get_tensor() + + var.set(np.zeros_like(np_t), place) + + ptb_model.set_dict(self.state_dict) + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + + self.assertTrue(np.array_equal(new_t, base_t)) + + def testSetNumpy(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [1.0] + # this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + new_lr = 1.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + adam = Adam( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr_arr), + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + for i in range(batch_num): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + if i == 0: + for param in ptb_model.parameters(): + dy_param_init[param.name] = param.numpy() + dy_loss.backward() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + if i == batch_num - 1: + for param in ptb_model.parameters(): + dy_param_updated[param.name] = param.numpy() + + # check optimizer + opti_dict = adam.state_dict() + np_opti_dict = {} + # set to zero + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + np_t = v.numpy() + np_opti_dict[v.name] = np_t + var = v.value().get_tensor() + var.set(np.zeros_like(np_t), place) + self.assertTrue(np.sum(np.abs(v.numpy())) == 0) + else: + np_opti_dict[k] = v + + if isinstance(adam._learning_rate, LearningRateDecay): + adam._learning_rate.step_num = 0 + + adam.set_state_dict(np_opti_dict) + + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name])) + else: + self.assertEqual(v, self.base_opti[k]) + + # check parameter + state_dict = ptb_model.state_dict() + np_state_dict = {} + for k, v in 
state_dict.items(): + np_t = v.numpy() + np_state_dict[k] = np_t + var = v.value().get_tensor() + + var.set(np.zeros_like(np_t), place) + + ptb_model.set_dict(np_state_dict) + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + + self.assertTrue(np.array_equal(new_t, base_t)) + + def testSetVariableBeforeTrain(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + adam = Adam( + learning_rate=0.0, + beta1=0.8, + beta2=0.6, + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + adam.set_state_dict(self.opti_dict) + ptb_model.set_dict(self.state_dict) + + for i in range(1): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + + dy_loss.backward() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if k == "global_step": + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] + 1)) + + if k.find("beta1_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta1)) + if k.find("beta2_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta2)) + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + self.assertTrue(np.array_equal(new_t, base_t)) + + def testLoadAndSetVarBaseBeforeTrain(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [0.0] + # this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + # set lr to zero not update parameter + new_lr = 0.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + adam = Adam( + learning_rate=0.0, + beta1=0.8, + beta2=0.6, + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + state_dict, opti_dict = fluid.load_dygraph("./test_dy") + 
adam.set_state_dict(opti_dict) + ptb_model.set_dict(state_dict) + + for i in range(1): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + + dy_loss.backward() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if k == "global_step": + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] + 1)) + + if k.find("beta1_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta1)) + if k.find("beta2_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta2)) + + # check parameter + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + self.assertTrue(np.array_equal(new_t, base_t)) + + def testSetNumpyBeforeTrain(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [0.0] + # this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + # set lr to 0.0, not update parameter + new_lr = 0.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + adam = Adam( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr_arr), + beta1=0.8, + beta2=0.6, + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + np_opti_dict = {} + np_state_dict = {} + + for k, v in self.opti_dict.items(): + if isinstance(v, core.VarBase): + np_opti_dict[v.name] = v.numpy() + else: + np_opti_dict[k] = v + + for k, v in self.state_dict.items(): + np_state_dict[k] = v.numpy() + + adam.set_state_dict(np_opti_dict) + ptb_model.set_dict(np_state_dict) + for i in range(1): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + + dy_loss.backward() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if k == "global_step": + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] + 1)) + + if k.find("beta1_pow_acc_0") > 0: + 
self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta1)) + if k.find("beta2_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta2)) + + # check parameter + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + self.assertTrue(np.array_equal(new_t, base_t)) + + def testOnlyLoadParams(self): + with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) + state_dict = emb.state_dict() + paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy')) + + para_state_dict, opti_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy')) + + self.assertTrue(opti_state_dict == None) + + para_state_dict, opti_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy.pdparams')) + + para_state_dict, opti_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy.pdopt')) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index b3773ddeb42fc..2c2f6f1ce7e14 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -691,48 +691,6 @@ def _create_optimization_pass(self, parameters_and_grads): end = len(target_block.ops) return target_block._slice_ops(start, end) - def _process_distribute_lookuptable(self, param_grads): - """ - Because distribute lookup table only support SGD optimizer for now, not support - other optimizer and regularization, so we should find the table parameter out, - and avoid to add regularization and other op for it, and add sgd optimize op - for it independently. - :param param_grads(list((Var, Var))): list of (param, grad) pair. - :param loss: the loss tensor. - :param startup_program: the startup program - """ - program = framework.default_main_program() - global_block = framework.default_main_program().global_block() - table_name = find_distributed_lookup_table(program) - table_param = None - table_grad = None - new_param_grads = [] - for p, g in param_grads: - if p.name == table_name: - if table_param is not None: - raise RuntimeError( - "multi dist table var found, only support one now!") - table_param = p - table_grad = g - else: - new_param_grads.append((p, g)) - sgd_op = None - if table_param is not None: - param_and_grad = [table_param, table_grad] - with table_param.block.program._optimized_guard(param_and_grad), \ - framework.name_scope("optimizer"): - self._create_global_learning_rate() - # create the optimize op - sgd_op = global_block.append_op( - type='sgd', - inputs={ - "Param": table_param, - "Grad": table_grad, - "LearningRate": self._create_param_lr(param_and_grad) - }, - outputs={"ParamOut": param_and_grad[0]}) - return new_param_grads, (table_param, table_grad), sgd_op - def _append_dgc_ops(self, param_and_grad): pass
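Finally, the new test_imperative_save_load_v2.py above round-trips parameter and optimizer state through the 2.0 save/load helpers. A condensed sketch of that flow, using exactly the helpers the test uses; the "./ckpt_demo" prefix and the toy Linear model are placeholders:

    import numpy as np
    import paddle
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        linear = fluid.dygraph.nn.Linear(10, 10)
        adam = paddle.optimizer.Adam(0.001, parameters=linear.parameters())

        a = fluid.dygraph.to_variable(
            np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32"))
        loss = fluid.layers.reduce_mean(linear(a))
        loss.backward()
        adam.minimize(loss)
        linear.clear_gradients()

        # Persist both dicts; this writes ckpt_demo.pdparams and
        # ckpt_demo.pdopt next to the given prefix.
        paddle.save(linear.state_dict(), "./ckpt_demo")
        fluid.save_dygraph(adam.state_dict(), "./ckpt_demo")

        # Restore: paddle.load returns (parameter_dict, optimizer_dict).
        para_state_dict, opti_state_dict = paddle.load("./ckpt_demo")
        linear.set_dict(para_state_dict)
        adam.set_state_dict(opti_state_dict)

This is the same pattern exercised by TestDygraphPtbRnn.setUp and testLoadAndSetVarBase in the test file added above.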