From 3bb1a5f4b672474416834a612cffdbcebe4b6dda Mon Sep 17 00:00:00 2001 From: xiongkun Date: Thu, 9 Mar 2023 07:25:44 +0000 Subject: [PATCH 01/43] merge --- .../fluid/eager/to_static/run_program_op_func.h | 12 +++++++----- .../fluid/eager/to_static/run_program_op_node.h | 17 ++++++++++++++--- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index fad6f7bd31e43..3b5fc14c04901 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -60,11 +60,6 @@ inline void run_program_ad_func( std::vector& step_scope, // NOLINT std::vector& dout, // NOLINT const paddle::framework::AttributeMap& attrs) { - VLOG(2) << "start run run_program"; - // Call forward function - RunProgramAPI(x, params, out, step_scope, dout, attrs); - VLOG(2) << "start run run_program grad"; - // Prepare Autograd Meta auto deref_out = details::DereferenceTensors(out); std::vector p_autograd_x = @@ -78,6 +73,13 @@ inline void run_program_ad_func( bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( trace_backward, &p_autograd_x, &p_autograd_params); + VLOG(2) << "start run run_program with require_any_grad = " + << require_any_grad; + // Call forward function + // if require_any_grad is False, don't save any middle vars. + RunProgramAPI(x, params, out, step_scope, dout, require_any_grad, attrs); + VLOG(2) << "start run run_program grad"; + if (require_any_grad) { egr::EagerUtils::PassStopGradient(false, &p_autograd_outs); // Create GradOpNode (1 means [out_grad], 2 means [x_grad, paramx_grad]) diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index d19d671f1ea0b..0a192437f2901 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -283,6 +283,7 @@ inline void RunProgramAPI( std::vector &out, // NOLINT std::vector &step_scope, // NOLINT std::vector &dout, // NOLINT + bool require_any_grad, const paddle::framework::AttributeMap &attrs) { VLOG(2) << "RunProgramOpKernel Compute"; // In the original run_program OP, the default value of the is_test @@ -430,8 +431,10 @@ inline void RunProgramAPI( VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); - if (is_test || !egr::Controller::Instance().HasGrad()) { - VLOG(4) << "is test, set this scope can reused"; + if (is_test || !require_any_grad) { + VLOG(4) << "don't require any grad, set this scope can reused"; + VLOG(4) << "is_test: " << is_test + << ", require_any_grad: " << require_any_grad; global_inner_scope->SetCanReuesd(true); details::GcScope(global_inner_scope); } else { @@ -580,7 +583,15 @@ class GradNodeRunProgram : public egr::GradNodeBase { GradNodeRunProgram(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} - ~GradNodeRunProgram() override = default; + ~GradNodeRunProgram() { + auto *out_scope_vec = &step_scope_; + // Normally out_scope_vec.size() == 1. for safty, we add for-loop here. + for (size_t i = 0; i < out_scope_vec->size(); ++i) { + paddle::framework::Scope *global_inner_scope = out_scope_vec->at(i); + global_inner_scope->SetCanReuesd(true); // set this to reuse scope. 
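+      // The scope still holds the intermediate variables saved for backward;
+      // once this grad node is destroyed they are no longer needed, so free them.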
+ details::GcScope(global_inner_scope); + } + } // Functor: perform backward computations virtual paddle::small_vector, egr::kSlotSmallVectorSize> From a7b4981161533c44da6d48623d1b753ddf7ec7d1 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Thu, 9 Mar 2023 12:21:38 +0000 Subject: [PATCH 02/43] fix bugs while backward multi-times. --- paddle/fluid/eager/to_static/run_program_op_node.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 0a192437f2901..e0dfe4e5dc482 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -573,7 +573,7 @@ inline void RunProgramGradAPI( *backward_global_block, global_inner_scope); VLOG(4) << "after backward gc all vars"; - global_inner_scope->SetCanReuesd(true); + global_inner_scope->SetCanReuesd(false); // can't reuse util call `~GradNodeRunProgram` details::GcScope(global_inner_scope); } } From 337ca6ff86c5926c572146b54b9208338093d438 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Fri, 10 Mar 2023 06:05:41 +0000 Subject: [PATCH 03/43] train-step first commit: loss.backward support. --- python/paddle/fluid/framework.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 6e94f528b1a21..cacb574b46cd7 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -27,6 +27,7 @@ import multiprocessing import sys import logging +import paddle from .proto import framework_pb2, data_feed_pb2 @@ -1681,7 +1682,7 @@ def numpy(self): """ pass - @fake_interface_only + @_non_static_only_ def backward(self, retain_graph=False): """ **Notes**: @@ -1712,13 +1713,21 @@ def backward(self, retain_graph=False): # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since # there is no one need gradient on it. tmp.stop_gradient=False - inputs.append(tmp) + INPUTS.APPEND(TMP) ret = paddle.add_n(inputs) loss = paddle.sum(ret) loss.backward() """ - pass + if retain_graph is True: + raise AssertionError( + "`retain_graph` == True is not supported in @to_static function." + "please set retain_graph = False." + ) + param_grad_list = paddle.fluid.backward.append_backward(self) + for param, param_grad in param_grad_list: + # set grad to simulate dygraph loss.backward() in static mode. + setattr(param, "grad", param_grad) @fake_interface_only def gradient(self): From 5718f65370b486951bd670704dfa10362f4d79fc Mon Sep 17 00:00:00 2001 From: xiongkun Date: Mon, 13 Mar 2023 03:36:18 +0000 Subject: [PATCH 04/43] 1. support Adam and Accumulator. 2. remote double_grad in Dy2static. 
(test_grad.py is OK) --- python/paddle/fluid/compiler.py | 2 +- python/paddle/fluid/framework.py | 9 + .../paddle/jit/dy2static/partial_program.py | 36 +--- .../jit/dy2static/program_translator.py | 2 +- python/paddle/optimizer/adam.py | 6 +- python/paddle/optimizer/optimizer.py | 155 ++++++++++-------- 6 files changed, 111 insertions(+), 99 deletions(-) diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index bff15df5f9fcc..24d1802d30b9d 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -733,7 +733,7 @@ def patch_getter(self, item): self._caches[item_id] = ( concrete_program, - partial_program_from(concrete_program), + partial_program_from(concrete_program, item.class_instance is not None), ) # Note: raise warnings if number of traced program is more than `max_tracing_count` current_tracing_count = len(self._caches) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index cacb574b46cd7..55b1b8fa70db5 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -7593,6 +7593,15 @@ def _get_var(name, program=None): return program.global_block().var(name) +@signature_safe_contextmanager +def dygraph_guard_if_declarative(): + from .dygraph.base import in_declarative_mode + if in_declarative_mode(): + # Under @paddle.jit.to_static decorator, we switch back dygraph mode temporarily. + with paddle.fluid.framework._dygraph_guard(tracer=paddle.fluid.dygraph.Tracer()): + yield + else: + yield @signature_safe_contextmanager def _dygraph_guard(tracer): diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 3d86441087f09..f4a348f038762 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -238,7 +238,8 @@ def _get_scope(self, program_id=None, use_scope_cache=False): @LazyInitialized def _double_grads(self): - return self._get_double_grads(self._origin_main_program) + # TODO: check the affects. 
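+        # Double-grad variables are no longer collected here: double grad
+        # support is removed from dy2static in this change, and the old
+        # _get_double_grads helper is deleted below.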
+ return None # whole @switch_to_static_graph @@ -296,8 +297,9 @@ def _create_forward_backward_train_program(self): _, forward_end_op_index = self._infer_info('fp32', self._create_program) assert forward_end_op_index >= 0 + # TODO: wait for the PR: https://github.com/PaddlePaddle/Paddle/pull/50885 return self._get_forward_backward_program_form( - whole_program, forward_end_op_index + whole_program, 9 ) @switch_to_static_graph @@ -650,32 +652,6 @@ def _prune_unused_params(self, program): self._params = required_params - def _get_double_grads(self, program): - double_grads = [] - for block in program.blocks: - for name in block.vars: - if "@GRAD" in name: - var_desc = block.vars[name].desc - var_base = None - if not framework.global_var._in_eager_mode_: - var_base = core.VarBase( - var_desc.dtype(), - var_desc.shape(), - var_desc.name(), - var_desc.type(), - False, - ) - else: - var_base = core.eager.Tensor( - var_desc.dtype(), - var_desc.shape(), - var_desc.name(), - var_desc.type(), - False, - ) - double_grads.append(var_base) - return self._valid_vars(double_grads) - def _cast_fp16_if_pure_fp16(self, in_vars): from paddle.amp.auto_cast import _in_pure_fp16_guard @@ -1085,9 +1061,9 @@ def _valid_vars(self, vars): return vars if vars else None -def partial_program_from(concrete_program): +def partial_program_from(concrete_program, from_method=False): inputs = concrete_program.inputs - if inputs and isinstance(inputs[0], layers.Layer): + if inputs and from_method: inputs = inputs[1:] return PartialProgramLayer( diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index 41722ad4bad91..70c20877cce7d 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -1195,7 +1195,7 @@ def _build_once(self, cache_key): if not _in_amp_guard() and not _in_pure_fp16_guard(): concrete_program._to_prim() - return concrete_program, partial_program_from(concrete_program) + return concrete_program, partial_program_from(concrete_program, cache_key.class_instance is not None) def __getitem__(self, item): if not isinstance(item, CacheKey): diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 070efdff2d126..063ef289c64cf 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -444,7 +444,7 @@ def _append_optimize_op(self, block, param_and_grad): return adam_op @imperative_base.no_grad - @framework.dygraph_only + @framework._non_static_only_ def step(self): """ Execute the optimizer and update parameters once. 
@@ -467,6 +467,10 @@ def step(self): adam.step() adam.clear_grad() """ + if paddle.fluid.dygraph.base.in_declarative_mode(): + self._declarative_step() + return + if not isinstance(self._parameter_list[0], dict): params_grads = [] for param in self._parameter_list: diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 71ca201ff9f04..24fd0d645e9d5 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -411,65 +411,69 @@ def get_opti_var_name_list(self): return self._opti_name_list def _create_global_learning_rate(self): - # lr var can't be float16 or bfloat16, for pure fp16 or bf16 training, should extra handle the dtype for lr - _lr_dtype = ( - paddle.get_default_dtype() if self._dtype is None else self._dtype - ) - _lr_dtype = ( - paddle.float32 - if ( - ( - paddle.get_default_dtype() != "float16" - and _lr_dtype == paddle.float16 - ) - or ( - paddle.get_default_dtype() != "bfloat16" - and _lr_dtype == paddle.bfloat16 - ) + def do_create(): + # lr var can't be float16 or bfloat16, for pure fp16 or bf16 training, should extra handle the dtype for lr + _lr_dtype = ( + paddle.get_default_dtype() if self._dtype is None else self._dtype ) - else _lr_dtype - ) - if isinstance(self._learning_rate, LRScheduler): - lr_var = self._global_learning_rate() - # only create global lr_var once - if not isinstance(lr_var, framework.Variable): - lr_name = unique_name.generate('learning_rate') - self._learning_rate._var_name = lr_name - lr_var = self.helper.create_global_variable( - name=lr_name, - shape=[1], - persistable=True, - stop_gradient=True, - dtype=_lr_dtype, + _lr_dtype = ( + paddle.float32 + if ( + ( + paddle.get_default_dtype() != "float16" + and _lr_dtype == paddle.float16 + ) + or ( + paddle.get_default_dtype() != "bfloat16" + and _lr_dtype == paddle.bfloat16 + ) ) - main_prog = framework.default_main_program() - main_prog.lr_sheduler = self._learning_rate - main_prog.lr_var = lr_var - - self._learning_rate_map[ - framework.default_main_program() - ] = lr_var - - lr_value = float(self._learning_rate()) - self.helper.set_variable_initializer( - lr_var, - initializer=paddle.nn.initializer.Constant(value=lr_value), + else _lr_dtype ) - elif isinstance(self._learning_rate, float): - # only create global lr_var once - lr = self._global_learning_rate() - if isinstance(lr, framework.Variable): - return - else: - self._learning_rate_map[ - framework.default_main_program() - ] = paddle.static.create_global_var( - name=unique_name.generate("learning_rate"), - shape=[1], - value=float(self._learning_rate), - dtype=_lr_dtype, - persistable=True, + if isinstance(self._learning_rate, LRScheduler): + lr_var = self._global_learning_rate() + # only create global lr_var once + if not isinstance(lr_var, framework.Variable): + lr_name = unique_name.generate('learning_rate') + self._learning_rate._var_name = lr_name + lr_var = self.helper.create_global_variable( + name=lr_name, + shape=[1], + persistable=True, + stop_gradient=True, + dtype=_lr_dtype, + ) + main_prog = framework.default_main_program() + main_prog.lr_sheduler = self._learning_rate + main_prog.lr_var = lr_var + + self._learning_rate_map[ + framework.default_main_program() + ] = lr_var + + lr_value = float(self._learning_rate()) + self.helper.set_variable_initializer( + lr_var, + initializer=paddle.nn.initializer.Constant(value=lr_value), ) + elif isinstance(self._learning_rate, float): + # only create global lr_var once + lr = self._global_learning_rate() + if 
isinstance(lr, framework.Variable): + return + else: + self._learning_rate_map[ + framework.default_main_program() + ] = paddle.static.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(self._learning_rate), + dtype=_lr_dtype, + persistable=True, + ) + + with paddle.fluid.framework.dygraph_guard_if_declarative(): + do_create() @framework.dygraph_only def set_lr(self, value): @@ -905,14 +909,15 @@ def _create_optimization_pass( ) if isinstance(parameters_and_grads, list): - self._create_accumulators( - target_block, - [ - p[0] - for p in parameters_and_grads - if not p[0].stop_gradient - ], - ) + with paddle.fluid.framework.dygraph_guard_if_declarative(): + self._create_accumulators( + target_block, + [ + p[0] + for p in parameters_and_grads + if not p[0].stop_gradient + ], + ) else: params_acc_dict = parameters_and_grads.copy() params_acc_dict['params'] = [ @@ -920,7 +925,8 @@ def _create_optimization_pass( for p in params_acc_dict['params'] if not p[0].stop_gradient ] - self._create_accumulators(target_block, params_acc_dict) + with paddle.fluid.framework.dygraph_guard_if_declarative(): + self._create_accumulators(target_block, params_acc_dict) if framework._non_static_mode(): if isinstance(parameters_and_grads, list): @@ -1265,7 +1271,7 @@ def _get_no_grad_set(self, loss, no_grad_set=None): return no_grad_set - @framework.dygraph_only + @framework._non_static_only_ def clear_grad(self, set_to_zero=True): """ Clear the gradients of all optimized parameters for model. @@ -1378,8 +1384,22 @@ def minimize( return optimize_ops, params_grads + def _declarative_step(self): + """ + In declarative mode, we forward `call step` to `call apply_gradients` + """ + params = paddle.static.default_main_program().global_block().all_parameters() + assert isinstance(self._parameter_list, list), "Only list of parameters is supported while using optimizer in @paddle.jit.static." + selected_params = set([param.name for param in self._parameter_list]) + parameters = [param for param in params if param.trainable] + parameters = list(filter (lambda x: x.name in selected_params and hasattr(x, "grad"), + parameters)) + params_grads = [(param, param.grad) for param in parameters] + optimize_ops = self.apply_gradients(params_grads) + return + @imperative_base.no_grad() - @framework.dygraph_only + @framework._non_static_only_ def step(self): """ Execute the optimizer and update parameters once. 
@@ -1402,6 +1422,9 @@ def step(self): adam.step() adam.clear_grad() """ + if paddle.fluid.dygraph.base.in_declarative_mode(): + self._declarative_step() + return if not isinstance(self._param_groups[0], dict): params_grads = [] From b88961c9d5c671c6565eeba812785bd647c2942d Mon Sep 17 00:00:00 2001 From: xiongkun Date: Mon, 13 Mar 2023 03:58:06 +0000 Subject: [PATCH 05/43] code format by ci --- paddle/fluid/eager/to_static/run_program_op_node.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index e0dfe4e5dc482..f78912087e328 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -573,7 +573,8 @@ inline void RunProgramGradAPI( *backward_global_block, global_inner_scope); VLOG(4) << "after backward gc all vars"; - global_inner_scope->SetCanReuesd(false); // can't reuse util call `~GradNodeRunProgram` + global_inner_scope->SetCanReuesd( + false); // can't reuse util call `~GradNodeRunProgram` details::GcScope(global_inner_scope); } } From 0aecaac30b044f5c12c4b3f3bb383f811d0f5fd5 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Wed, 15 Mar 2023 07:12:15 +0000 Subject: [PATCH 06/43] use `non_static_only` instead of `_non_static_only_` --- python/paddle/optimizer/optimizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 100ec6149b52d..9a9adb99e6cfc 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -1337,7 +1337,7 @@ def _get_no_grad_set(self, loss, no_grad_set=None): return no_grad_set - @framework._non_static_only_ + @framework.non_static_only def clear_grad(self, set_to_zero=True): """ Clear the gradients of all optimized parameters for model. @@ -1473,7 +1473,7 @@ def _declarative_step(self): return @imperative_base.no_grad() - @framework._non_static_only_ + @framework.non_static_only def step(self): """ Execute the optimizer and update parameters once. From f7fe08d610cfea56e6ae4b90675a64f70698f271 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Wed, 15 Mar 2023 07:16:09 +0000 Subject: [PATCH 07/43] fix assertion --- python/paddle/optimizer/optimizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 9a9adb99e6cfc..aa659f5a8fd3d 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -1457,8 +1457,8 @@ def _declarative_step(self): params = ( paddle.static.default_main_program().global_block().all_parameters() ) - assert isinstance( - self._parameter_list, list + assert not isinstance( + self._parameter_list[0], dict ), "Only list of parameters is supported while using optimizer in @paddle.jit.static." 
selected_params = set([param.name for param in self._parameter_list]) parameters = [param for param in params if param.trainable] From eb278aa1561965b08dafe2a69a0a5519463ca734 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Wed, 15 Mar 2023 07:17:59 +0000 Subject: [PATCH 08/43] restore sample code --- python/paddle/fluid/framework.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 642b76ed7baa1..2b49539b97b8c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1713,7 +1713,7 @@ def backward(self, retain_graph=False): # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since # there is no one need gradient on it. tmp.stop_gradient=False - INPUTS.APPEND(TMP) + inputs.append(tmp) ret = paddle.add_n(inputs) loss = paddle.sum(ret) loss.backward() From 354155fae8ec6ba218ea01aa89a7a0269a149e72 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Wed, 15 Mar 2023 07:24:18 +0000 Subject: [PATCH 09/43] fix op._set_attr --- python/paddle/jit/dy2static/partial_program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 6626966a9101a..6840ac7eae87c 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -111,7 +111,7 @@ def _change_is_test_status(program, is_test): for block in program.blocks: for op in block.ops: if op.has_attr('is_test'): - op._set_attr('is_test', is_test) + op.desc._set_bool_attr('is_test', is_test) return program From 7ca1384703887dcecc0be489b09809d7917a85bc Mon Sep 17 00:00:00 2001 From: SigureMo Date: Wed, 15 Mar 2023 13:20:50 +0000 Subject: [PATCH 10/43] support all optimizer, add uts --- .../dygraph_to_static/test_train_step.py | 189 ++++++++++++++++++ python/paddle/optimizer/adam.py | 2 +- python/paddle/optimizer/adamw.py | 6 +- 3 files changed, 195 insertions(+), 2 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py new file mode 100644 index 0000000000000..bfee93aa74bec --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py @@ -0,0 +1,189 @@ +import random +import unittest +from functools import partial + +import numpy as np + +import paddle +from paddle.vision.models import resnet18 + +# paddle.set_device('cpu') + + +def reset_seed(): + paddle.seed(1010) + np.random.seed(1010) + random.seed(1010) + + +def loss_fn_tiny_model(x): + return x.mean() + + +def train_step_tiny_model(net, x, loss_fn, opt): + out = net(x) + loss = loss_fn(out) + loss.backward() + opt.step() + opt.clear_grad() + return loss + + +class TinyModel(paddle.nn.Layer): + def __init__(self): + super(TinyModel, self).__init__() + self.layer1 = paddle.nn.Linear(10, 10) + + def forward(self, data): + return self.layer1(data) + + +class TestTrainStepTinyModel(unittest.TestCase): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial(paddle.optimizer.SGD, 0.001) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 5 + + def get_train_step_losses(self, func, steps): + losses = [] + net = self.net_creator() + optimizer = 
self.optimizer_creator(parameters=net.parameters()) + for _ in range(steps): + out = func(net, self.input, self.loss_fn, optimizer) + losses.append(out) + return losses + + def test_train_step(self): + reset_seed() + dygraph_losses = self.get_train_step_losses( + self.train_step_func, self.steps + ) + reset_seed() + static_losses = self.get_train_step_losses( + paddle.jit.to_static(self.train_step_func), self.steps + ) + print(dygraph_losses, static_losses) + self.assertEqual(len(dygraph_losses), len(static_losses)) + for dygraph_loss, static_loss in zip(dygraph_losses, static_losses): + dygraph_loss = dygraph_loss.numpy() + static_loss = static_loss.numpy() + np.testing.assert_allclose(dygraph_loss, static_loss, rtol=1e-5) + + +class TestTrainStepTinyModelAdadelta(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial(paddle.optimizer.Adadelta, 0.001) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 5 + + +class TestTrainStepTinyModelAdadelta(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial(paddle.optimizer.Adadelta, 0.001) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 5 + + +class TestTrainStepTinyModelAdagrad(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial(paddle.optimizer.Adagrad, 0.001) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 5 + + +class TestTrainStepTinyModelAdam(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial(paddle.optimizer.Adam, 0.001) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 5 + + +class TestTrainStepTinyModelAdamax(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial(paddle.optimizer.Adamax, 0.001) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 5 + + +class TestTrainStepTinyModelAdamW(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial(paddle.optimizer.AdamW, 0.001) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 5 + + +class TestTrainStepTinyModelLamb(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.Lamb, learning_rate=0.001, lamb_weight_decay=0.01 + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 5 + + +class TestTrainStepTinyModelMomentum(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial(paddle.optimizer.Momentum, 0.001) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 5 + + +class TestTrainStepTinyModelRMSProp(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = 
partial(paddle.optimizer.RMSProp, 0.001) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 5 + + +# class TestTrainStepResNet18Sgd(TestTrainStepTinyModel): +# def setUp(self): +# self.input = paddle.randn([64, 3, 224, 224]) +# self.net_creator = resnet18 +# self.optimizer_creator = partial(paddle.optimizer.SGD, 0.001) +# self.loss_fn = loss_fn_tiny_model +# self.train_step_func = train_step_tiny_model +# self.steps = 5 + + +# class TestTrainStepResNet18Adam(TestTrainStepTinyModel): +# def setUp(self): +# self.input = paddle.randn([64, 3, 224, 224]) +# self.net_creator = resnet18 +# self.optimizer_creator = partial(paddle.optimizer.Adam, 0.001) +# self.loss_fn = loss_fn_tiny_model +# self.train_step_func = train_step_tiny_model +# self.steps = 5 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index eceb796c154d0..c86adbb1ee45a 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -389,7 +389,7 @@ def _append_optimize_op(self, block, param_and_grad): return adam_op @imperative_base.no_grad - @framework._non_static_only_ + @framework.non_static_only def step(self): """ Execute the optimizer and update parameters once. diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 0233ad1c972c1..3a27c1859010c 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -530,7 +530,7 @@ def __str__(self): return " ".join(["Weight Decay, params:", ",".join(self._params_name)]) @imperative_base.no_grad - @framework.dygraph_only + @framework.non_static_only def step(self): """ Execute the optimizer and update parameters once. @@ -553,6 +553,10 @@ def step(self): opt.step() opt.clear_grad() """ + if paddle.fluid.dygraph.base.in_declarative_mode(): + self._declarative_step() + return + if not isinstance(self._parameter_list[0], dict): params_grads = [] for param in self._parameter_list: From 6775b4b820d844bd60a9f0341122070332e8aade Mon Sep 17 00:00:00 2001 From: SigureMo Date: Wed, 15 Mar 2023 13:30:18 +0000 Subject: [PATCH 11/43] fix codestyle --- .../dygraph_to_static/test_train_step.py | 25 +++++++++++-------- python/paddle/optimizer/optimizer.py | 2 +- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py index bfee93aa74bec..3dc1b56000715 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import random import unittest from functools import partial @@ -5,7 +19,6 @@ import numpy as np import paddle -from paddle.vision.models import resnet18 # paddle.set_device('cpu') @@ -83,16 +96,6 @@ def setUp(self): self.steps = 5 -class TestTrainStepTinyModelAdadelta(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([10000, 10]) - self.net_creator = TinyModel - self.optimizer_creator = partial(paddle.optimizer.Adadelta, 0.001) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 5 - - class TestTrainStepTinyModelAdagrad(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index aa659f5a8fd3d..4b6725e27a0db 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -469,7 +469,7 @@ def do_create(): framework.default_main_program() ] = paddle.static.create_global_var( name=unique_name.generate("learning_rate"), - shape=[], + shape=[1], value=float(self._learning_rate), dtype=_lr_dtype, persistable=True, From 46ce083eac6988e8b84f3cfe0ec6f00450a62586 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Wed, 15 Mar 2023 13:36:29 +0000 Subject: [PATCH 12/43] add resnet18 tests --- .../dygraph_to_static/test_train_step.py | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py index 3dc1b56000715..da12e1f9d57f7 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py @@ -19,6 +19,7 @@ import numpy as np import paddle +from paddle.vision.models import resnet18 # paddle.set_device('cpu') @@ -168,24 +169,24 @@ def setUp(self): self.steps = 5 -# class TestTrainStepResNet18Sgd(TestTrainStepTinyModel): -# def setUp(self): -# self.input = paddle.randn([64, 3, 224, 224]) -# self.net_creator = resnet18 -# self.optimizer_creator = partial(paddle.optimizer.SGD, 0.001) -# self.loss_fn = loss_fn_tiny_model -# self.train_step_func = train_step_tiny_model -# self.steps = 5 - - -# class TestTrainStepResNet18Adam(TestTrainStepTinyModel): -# def setUp(self): -# self.input = paddle.randn([64, 3, 224, 224]) -# self.net_creator = resnet18 -# self.optimizer_creator = partial(paddle.optimizer.Adam, 0.001) -# self.loss_fn = loss_fn_tiny_model -# self.train_step_func = train_step_tiny_model -# self.steps = 5 +class TestTrainStepResNet18Sgd(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([64, 3, 224, 224]) + self.net_creator = resnet18 + self.optimizer_creator = partial(paddle.optimizer.SGD, 0.001) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepResNet18Adam(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([64, 3, 224, 224]) + self.net_creator = resnet18 + self.optimizer_creator = partial(paddle.optimizer.Adam, 0.001) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 if __name__ == "__main__": From 618df70f588b6c9f534d4e25042232fffb778966 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Thu, 16 Mar 2023 03:15:39 +0000 Subject: [PATCH 13/43] up the train step time limit --- .../unittests/dygraph_to_static/CMakeLists.txt | 1 + 
.../unittests/dygraph_to_static/test_train_step.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index e1e14bd75c910..185de39d7c339 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -66,6 +66,7 @@ set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) #set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120) +set_tests_properties(test_train_step PROPERTIES TIMEOUT 120) if(NOT WIN32) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py index da12e1f9d57f7..193c8c8be3ea3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py @@ -169,6 +169,20 @@ def setUp(self): self.steps = 5 +# This will raise an error due to 0D lr +# class TestTrainStepTinyModelLRNoamDecay(TestTrainStepTinyModel): +# def setUp(self): +# self.input = paddle.randn([64, 3, 224, 224]) +# self.net_creator = resnet18 +# self.optimizer_creator = partial( +# paddle.optimizer.SGD, +# paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100), +# ) +# self.loss_fn = loss_fn_tiny_model +# self.train_step_func = train_step_tiny_model +# self.steps = 3 + + class TestTrainStepResNet18Sgd(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([64, 3, 224, 224]) From 531a5ec0f333853f78e10f2432c34df7f0505adf Mon Sep 17 00:00:00 2001 From: SigureMo Date: Thu, 16 Mar 2023 06:23:57 +0000 Subject: [PATCH 14/43] up the train step time limit --- .../fluid/tests/unittests/dygraph_to_static/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index 185de39d7c339..ed079ca26aa25 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -66,7 +66,7 @@ set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) #set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120) -set_tests_properties(test_train_step PROPERTIES TIMEOUT 120) +set_tests_properties(test_train_step PROPERTIES TIMEOUT 200) if(NOT WIN32) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) From 9f01a0ccb5d2da2eaa1fbbd64ec2a64d1593a8ae Mon Sep 17 00:00:00 2001 From: SigureMo Date: Thu, 16 Mar 2023 08:29:42 +0000 Subject: [PATCH 15/43] up the train step time limit --- .../fluid/tests/unittests/dygraph_to_static/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index ed079ca26aa25..0f1b351336655 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ 
-66,7 +66,7 @@ set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) #set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120) -set_tests_properties(test_train_step PROPERTIES TIMEOUT 200) +set_tests_properties(test_train_step PROPERTIES TIMEOUT 500) if(NOT WIN32) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) From f30242fa37d58a0f689b01fdd742f10987b84145 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Mon, 20 Mar 2023 03:15:31 +0000 Subject: [PATCH 16/43] fix prim issues due to merge conflicts --- python/paddle/jit/dy2static/program_translator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index ae7248fbce0ef..f23186038fd83 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -1182,14 +1182,14 @@ def _build_once(self, cache_key): ) ) - partial_program = partial_program_from(concrete_program) + partial_program = partial_program_from( + concrete_program, cache_key.class_instance is not None + ) if core._is_fwd_prim_enabled() and not _in_amp_guard(): partial_program.set_hooker( PrimHooker(concrete_program.main_program) ) - return concrete_program, partial_program_from( - concrete_program, cache_key.class_instance is not None - ) + return concrete_program, partial_program def __getitem__(self, item): if not isinstance(item, CacheKey): From b82ee42f2e6a00d42d705df80a278af81a164088 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Mon, 20 Mar 2023 03:48:49 +0000 Subject: [PATCH 17/43] `_non_static_only_` -> `non_static_only` --- python/paddle/fluid/framework.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 2b49539b97b8c..d3f837ce23927 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1682,7 +1682,7 @@ def numpy(self): """ pass - @_non_static_only_ + @non_static_only def backward(self, retain_graph=False): """ **Notes**: From 821d4f17aa1c938eb3f18095e1d6a01ca3f51314 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Mon, 20 Mar 2023 06:26:46 +0000 Subject: [PATCH 18/43] add all lr scheduler tests --- .../dygraph_to_static/test_train_step.py | 248 ++++++++++++++++-- python/paddle/optimizer/optimizer.py | 2 +- 2 files changed, 228 insertions(+), 22 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py index 193c8c8be3ea3..1d6879da0b1bc 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py @@ -21,6 +21,7 @@ import paddle from paddle.vision.models import resnet18 +# TODO(SigureMo): remove this before merge # paddle.set_device('cpu') @@ -79,7 +80,6 @@ def test_train_step(self): static_losses = self.get_train_step_losses( paddle.jit.to_static(self.train_step_func), self.steps ) - print(dygraph_losses, static_losses) self.assertEqual(len(dygraph_losses), len(static_losses)) for dygraph_loss, static_loss in zip(dygraph_losses, static_losses): dygraph_loss = dygraph_loss.numpy() @@ -94,7 +94,7 @@ def setUp(self): self.optimizer_creator = partial(paddle.optimizer.Adadelta, 0.001) self.loss_fn = loss_fn_tiny_model 
self.train_step_func = train_step_tiny_model - self.steps = 5 + self.steps = 3 class TestTrainStepTinyModelAdagrad(TestTrainStepTinyModel): @@ -104,7 +104,7 @@ def setUp(self): self.optimizer_creator = partial(paddle.optimizer.Adagrad, 0.001) self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model - self.steps = 5 + self.steps = 3 class TestTrainStepTinyModelAdam(TestTrainStepTinyModel): @@ -114,7 +114,7 @@ def setUp(self): self.optimizer_creator = partial(paddle.optimizer.Adam, 0.001) self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model - self.steps = 5 + self.steps = 3 class TestTrainStepTinyModelAdamax(TestTrainStepTinyModel): @@ -124,7 +124,7 @@ def setUp(self): self.optimizer_creator = partial(paddle.optimizer.Adamax, 0.001) self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model - self.steps = 5 + self.steps = 3 class TestTrainStepTinyModelAdamW(TestTrainStepTinyModel): @@ -134,7 +134,7 @@ def setUp(self): self.optimizer_creator = partial(paddle.optimizer.AdamW, 0.001) self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model - self.steps = 5 + self.steps = 3 class TestTrainStepTinyModelLamb(TestTrainStepTinyModel): @@ -146,7 +146,7 @@ def setUp(self): ) self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model - self.steps = 5 + self.steps = 3 class TestTrainStepTinyModelMomentum(TestTrainStepTinyModel): @@ -156,7 +156,7 @@ def setUp(self): self.optimizer_creator = partial(paddle.optimizer.Momentum, 0.001) self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model - self.steps = 5 + self.steps = 3 class TestTrainStepTinyModelRMSProp(TestTrainStepTinyModel): @@ -166,21 +166,227 @@ def setUp(self): self.optimizer_creator = partial(paddle.optimizer.RMSProp, 0.001) self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model - self.steps = 5 + self.steps = 3 + + +class TestTrainStepTinyModelLRNoamDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRPiecewiseDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.PiecewiseDecay( + boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4] + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRNaturalExpDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRInverseTimeDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + 
self.steps = 3 + + +class TestTrainStepTinyModelLRPolynomialDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.PolynomialDecay( + learning_rate=0.5, decay_steps=20 + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRLinearWarmup(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.LinearWarmup( + learning_rate=0.5, warmup_steps=2, start_lr=0, end_lr=0.5 + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRExponentialDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRMultiStepDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.MultiStepDecay( + learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8 + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRStepDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.StepDecay( + learning_rate=0.5, step_size=5, gamma=0.8 + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRLambdaDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.LambdaDecay( + learning_rate=0.5, lr_lambda=lambda x: 0.95**x + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRReduceOnPlateau(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.ReduceOnPlateau( + learning_rate=1.0, factor=0.5, patience=5 + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 -# This will raise an error due to 0D lr -# class TestTrainStepTinyModelLRNoamDecay(TestTrainStepTinyModel): -# def setUp(self): -# self.input = paddle.randn([64, 3, 224, 224]) -# self.net_creator = resnet18 -# self.optimizer_creator = partial( -# paddle.optimizer.SGD, -# paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100), -# ) -# self.loss_fn = loss_fn_tiny_model -# self.train_step_func = train_step_tiny_model -# self.steps = 3 +class TestTrainStepTinyModelLRCosineAnnealingDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + 
self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.CosineAnnealingDecay( + learning_rate=0.5, T_max=10 + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRMultiplicativeDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.MultiplicativeDecay( + learning_rate=0.5, lr_lambda=lambda x: 0.95 + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLROneCycleLR(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.OneCycleLR( + max_learning_rate=1.0, total_steps=3 + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRCyclicLR(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.CyclicLR( + base_learning_rate=0.5, + max_learning_rate=1.0, + step_size_up=15, + step_size_down=5, + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 class TestTrainStepResNet18Sgd(TestTrainStepTinyModel): diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 4b6725e27a0db..d3ec69e10f20c 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -441,7 +441,7 @@ def do_create(): self._learning_rate._var_name = lr_name lr_var = self.helper.create_global_variable( name=lr_name, - shape=[], + shape=[1], persistable=True, stop_gradient=True, dtype=_lr_dtype, From d02dcedbb8d7464f873a1ec40c7cb93ba116ce26 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Tue, 21 Mar 2023 06:56:05 +0000 Subject: [PATCH 19/43] add notes about `why remove first arg from method` --- python/paddle/jit/dy2static/partial_program.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 6840ac7eae87c..b2ecceb1aafa8 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -1094,6 +1094,8 @@ def _valid_vars(self, vars): def partial_program_from(concrete_program, from_method=False): inputs = concrete_program.inputs + + # NOTE(SigureMo): Remove the first arg `self` from method args. 
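+    # When the traced function is a bound method, inputs[0] is the Layer
+    # instance itself rather than a real program input, so drop it.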
if inputs and from_method: inputs = inputs[1:] From 72d5c78e39e27d1b0dc8d302d706e14f6b9d9c3f Mon Sep 17 00:00:00 2001 From: SigureMo Date: Tue, 21 Mar 2023 07:28:30 +0000 Subject: [PATCH 20/43] for codestyle --- .../fluid/tests/unittests/dygraph_to_static/test_train_step.py | 2 +- python/paddle/jit/dy2static/partial_program.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py index 1d6879da0b1bc..e16951d38ef7d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py @@ -46,7 +46,7 @@ def train_step_tiny_model(net, x, loss_fn, opt): class TinyModel(paddle.nn.Layer): def __init__(self): - super(TinyModel, self).__init__() + super().__init__() self.layer1 = paddle.nn.Linear(10, 10) def forward(self, data): diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index fe80cf68d8260..284b449ae8a51 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -23,7 +23,6 @@ from paddle.fluid.compiler import BuildStrategy from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.framework import _apply_pass -from paddle.nn.layer import layers from . import logging_utils from .utils import RETURN_NO_VALUE_MAGIC_NUM, _out_grad_names, _param_grad_names From d473b416304a1e5647bbce0c9cc50f7b45840b49 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Tue, 21 Mar 2023 12:02:46 +0000 Subject: [PATCH 21/43] inc train step time limit --- .../fluid/tests/unittests/dygraph_to_static/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index 0f1b351336655..e6a965a3f4a0b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -66,7 +66,7 @@ set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) #set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120) -set_tests_properties(test_train_step PROPERTIES TIMEOUT 500) +set_tests_properties(test_train_step PROPERTIES TIMEOUT 900) if(NOT WIN32) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) From fabb8c617940ef9c2968d3b147d808c3f3369499 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Wed, 22 Mar 2023 02:56:23 +0000 Subject: [PATCH 22/43] remove some lr scheduler test --- .../dygraph_to_static/CMakeLists.txt | 2 +- .../dygraph_to_static/test_train_step.py | 207 ------------------ 2 files changed, 1 insertion(+), 208 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index e6a965a3f4a0b..0f1b351336655 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -66,7 +66,7 @@ set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) #set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) 
set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120) -set_tests_properties(test_train_step PROPERTIES TIMEOUT 900) +set_tests_properties(test_train_step PROPERTIES TIMEOUT 500) if(NOT WIN32) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py index e16951d38ef7d..d7c083aa16b05 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py @@ -169,34 +169,6 @@ def setUp(self): self.steps = 3 -class TestTrainStepTinyModelLRNoamDecay(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([10000, 10]) - self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100), - ) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 3 - - -class TestTrainStepTinyModelLRPiecewiseDecay(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([10000, 10]) - self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.PiecewiseDecay( - boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4] - ), - ) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 3 - - class TestTrainStepTinyModelLRNaturalExpDecay(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) @@ -210,185 +182,6 @@ def setUp(self): self.steps = 3 -class TestTrainStepTinyModelLRInverseTimeDecay(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([10000, 10]) - self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1), - ) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 3 - - -class TestTrainStepTinyModelLRPolynomialDecay(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([10000, 10]) - self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.PolynomialDecay( - learning_rate=0.5, decay_steps=20 - ), - ) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 3 - - -class TestTrainStepTinyModelLRLinearWarmup(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([10000, 10]) - self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.LinearWarmup( - learning_rate=0.5, warmup_steps=2, start_lr=0, end_lr=0.5 - ), - ) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 3 - - -class TestTrainStepTinyModelLRExponentialDecay(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([10000, 10]) - self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9), - ) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 3 - - -class TestTrainStepTinyModelLRMultiStepDecay(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([10000, 10]) - self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - 
paddle.optimizer.lr.MultiStepDecay( - learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8 - ), - ) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 3 - - -class TestTrainStepTinyModelLRStepDecay(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([10000, 10]) - self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.StepDecay( - learning_rate=0.5, step_size=5, gamma=0.8 - ), - ) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 3 - - -class TestTrainStepTinyModelLRLambdaDecay(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([10000, 10]) - self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.LambdaDecay( - learning_rate=0.5, lr_lambda=lambda x: 0.95**x - ), - ) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 3 - - -class TestTrainStepTinyModelLRReduceOnPlateau(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([10000, 10]) - self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.ReduceOnPlateau( - learning_rate=1.0, factor=0.5, patience=5 - ), - ) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 3 - - -class TestTrainStepTinyModelLRCosineAnnealingDecay(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([10000, 10]) - self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.CosineAnnealingDecay( - learning_rate=0.5, T_max=10 - ), - ) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 3 - - -class TestTrainStepTinyModelLRMultiplicativeDecay(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([10000, 10]) - self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.MultiplicativeDecay( - learning_rate=0.5, lr_lambda=lambda x: 0.95 - ), - ) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 3 - - -class TestTrainStepTinyModelLROneCycleLR(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([10000, 10]) - self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.OneCycleLR( - max_learning_rate=1.0, total_steps=3 - ), - ) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 3 - - -class TestTrainStepTinyModelLRCyclicLR(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([10000, 10]) - self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.CyclicLR( - base_learning_rate=0.5, - max_learning_rate=1.0, - step_size_up=15, - step_size_down=5, - ), - ) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 3 - - class TestTrainStepResNet18Sgd(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([64, 3, 224, 224]) From 84d5dc60e7cc12b76539f4bf29d4da151a7dd557 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Wed, 22 Mar 2023 03:00:39 +0000 Subject: [PATCH 23/43] Revert "remove some lr scheduler test" This reverts commit fabb8c617940ef9c2968d3b147d808c3f3369499. 
--- .../dygraph_to_static/CMakeLists.txt | 2 +- .../dygraph_to_static/test_train_step.py | 207 ++++++++++++++++++ 2 files changed, 208 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index 0f1b351336655..e6a965a3f4a0b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -66,7 +66,7 @@ set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) #set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120) -set_tests_properties(test_train_step PROPERTIES TIMEOUT 500) +set_tests_properties(test_train_step PROPERTIES TIMEOUT 900) if(NOT WIN32) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py index d7c083aa16b05..e16951d38ef7d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py @@ -169,6 +169,34 @@ def setUp(self): self.steps = 3 +class TestTrainStepTinyModelLRNoamDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRPiecewiseDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.PiecewiseDecay( + boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4] + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + class TestTrainStepTinyModelLRNaturalExpDecay(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) @@ -182,6 +210,185 @@ def setUp(self): self.steps = 3 +class TestTrainStepTinyModelLRInverseTimeDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRPolynomialDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.PolynomialDecay( + learning_rate=0.5, decay_steps=20 + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRLinearWarmup(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.LinearWarmup( + learning_rate=0.5, warmup_steps=2, start_lr=0, end_lr=0.5 + ), + ) + self.loss_fn = loss_fn_tiny_model + 
self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRExponentialDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRMultiStepDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.MultiStepDecay( + learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8 + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRStepDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.StepDecay( + learning_rate=0.5, step_size=5, gamma=0.8 + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRLambdaDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.LambdaDecay( + learning_rate=0.5, lr_lambda=lambda x: 0.95**x + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRReduceOnPlateau(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.ReduceOnPlateau( + learning_rate=1.0, factor=0.5, patience=5 + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRCosineAnnealingDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.CosineAnnealingDecay( + learning_rate=0.5, T_max=10 + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRMultiplicativeDecay(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.MultiplicativeDecay( + learning_rate=0.5, lr_lambda=lambda x: 0.95 + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLROneCycleLR(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.OneCycleLR( + max_learning_rate=1.0, total_steps=3 + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +class TestTrainStepTinyModelLRCyclicLR(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([10000, 10]) + self.net_creator = TinyModel + 
self.optimizer_creator = partial( + paddle.optimizer.SGD, + paddle.optimizer.lr.CyclicLR( + base_learning_rate=0.5, + max_learning_rate=1.0, + step_size_up=15, + step_size_down=5, + ), + ) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + class TestTrainStepResNet18Sgd(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([64, 3, 224, 224]) From 251144de72e2338c3e55ba07dc67cf6f3ea5ce72 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Wed, 22 Mar 2023 03:21:37 +0000 Subject: [PATCH 24/43] split test into 3 difference tests --- .../dygraph_to_static/CMakeLists.txt | 3 +- .../dygraph_to_static/test_train_step.py | 24 ------------ .../test_train_step_resnet18_adam.py | 39 +++++++++++++++++++ .../test_train_step_resnet18_sgd.py | 39 +++++++++++++++++++ 4 files changed, 80 insertions(+), 25 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_adam.py create mode 100644 python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_sgd.py diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index e6a965a3f4a0b..55cab1f34ebb4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -66,7 +66,8 @@ set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) #set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120) -set_tests_properties(test_train_step PROPERTIES TIMEOUT 900) +set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 900) +set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 900) if(NOT WIN32) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py index e16951d38ef7d..144cea8c64dcd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py @@ -19,10 +19,6 @@ import numpy as np import paddle -from paddle.vision.models import resnet18 - -# TODO(SigureMo): remove this before merge -# paddle.set_device('cpu') def reset_seed(): @@ -389,25 +385,5 @@ def setUp(self): self.steps = 3 -class TestTrainStepResNet18Sgd(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([64, 3, 224, 224]) - self.net_creator = resnet18 - self.optimizer_creator = partial(paddle.optimizer.SGD, 0.001) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 3 - - -class TestTrainStepResNet18Adam(TestTrainStepTinyModel): - def setUp(self): - self.input = paddle.randn([64, 3, 224, 224]) - self.net_creator = resnet18 - self.optimizer_creator = partial(paddle.optimizer.Adam, 0.001) - self.loss_fn = loss_fn_tiny_model - self.train_step_func = train_step_tiny_model - self.steps = 3 - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_adam.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_adam.py new file mode 100644 index 0000000000000..e4b68eaf85ab5 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_adam.py @@ -0,0 +1,39 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from functools import partial + +from test_train_step import ( + TestTrainStepTinyModel, + loss_fn_tiny_model, + train_step_tiny_model, +) + +import paddle +from paddle.vision.models import resnet18 + + +class TestTrainStepResNet18Adam(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([64, 3, 224, 224]) + self.net_creator = resnet18 + self.optimizer_creator = partial(paddle.optimizer.Adam, 0.001) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_sgd.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_sgd.py new file mode 100644 index 0000000000000..2a08467e1b16b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_sgd.py @@ -0,0 +1,39 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +from functools import partial + +from test_train_step import ( + TestTrainStepTinyModel, + loss_fn_tiny_model, + train_step_tiny_model, +) + +import paddle +from paddle.vision.models import resnet18 + + +class TestTrainStepResNet18Sgd(TestTrainStepTinyModel): + def setUp(self): + self.input = paddle.randn([64, 3, 224, 224]) + self.net_creator = resnet18 + self.optimizer_creator = partial(paddle.optimizer.SGD, 0.001) + self.loss_fn = loss_fn_tiny_model + self.train_step_func = train_step_tiny_model + self.steps = 3 + + +if __name__ == "__main__": + unittest.main() From 878512948ea8ee22812e57523704ae49a786d825 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Thu, 23 Mar 2023 05:10:47 +0000 Subject: [PATCH 25/43] remove a useless TODO --- python/paddle/jit/dy2static/partial_program.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 284b449ae8a51..0f5e61fcd98e4 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -314,7 +314,6 @@ def _create_forward_backward_train_program(self): forward_end_op_index = self.get_forward_end_op_idx(whole_program) assert forward_end_op_index >= 0 - # TODO: wait for the PR: https://github.com/PaddlePaddle/Paddle/pull/50885 return self._get_forward_backward_program_form( whole_program, forward_end_op_index ) From bead669fdbe1f0f84805dfbe5f5756f6ca473cd2 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Fri, 24 Mar 2023 07:44:22 +0000 Subject: [PATCH 26/43] add lr scheduler support --- .../dygraph_to_static/test_train_step.py | 193 ++++++++++-------- .../test_train_step_resnet18_adam.py | 4 +- .../test_train_step_resnet18_sgd.py | 4 +- .../paddle/jit/dy2static/partial_program.py | 19 ++ 4 files changed, 130 insertions(+), 90 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py index 144cea8c64dcd..36cff5a7f3990 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py @@ -53,7 +53,8 @@ class TestTrainStepTinyModel(unittest.TestCase): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial(paddle.optimizer.SGD, 0.001) + self.lr_creator = lambda: 0.001 + self.optimizer_creator = paddle.optimizer.SGD self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 5 @@ -61,10 +62,17 @@ def setUp(self): def get_train_step_losses(self, func, steps): losses = [] net = self.net_creator() - optimizer = self.optimizer_creator(parameters=net.parameters()) + lr = self.lr_creator() + optimizer = self.optimizer_creator( + learning_rate=lr, parameters=net.parameters() + ) for _ in range(steps): - out = func(net, self.input, self.loss_fn, optimizer) - losses.append(out) + loss = func(net, self.input, self.loss_fn, optimizer) + if isinstance(lr, paddle.optimizer.lr.ReduceOnPlateau): + lr.step(loss) + elif isinstance(lr, paddle.optimizer.lr.LRScheduler): + lr.step() + losses.append(loss) return losses def test_train_step(self): @@ -73,21 +81,21 @@ def test_train_step(self): self.train_step_func, self.steps ) reset_seed() - static_losses = self.get_train_step_losses( - paddle.jit.to_static(self.train_step_func), self.steps - ) + static_func = paddle.jit.to_static(self.train_step_func) + 
static_losses = self.get_train_step_losses(static_func, self.steps) self.assertEqual(len(dygraph_losses), len(static_losses)) for dygraph_loss, static_loss in zip(dygraph_losses, static_losses): dygraph_loss = dygraph_loss.numpy() static_loss = static_loss.numpy() - np.testing.assert_allclose(dygraph_loss, static_loss, rtol=1e-5) + np.testing.assert_allclose(dygraph_loss, static_loss, rtol=1e-4) class TestTrainStepTinyModelAdadelta(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial(paddle.optimizer.Adadelta, 0.001) + self.lr_creator = lambda: 0.001 + self.optimizer_creator = paddle.optimizer.Adadelta self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -97,7 +105,8 @@ class TestTrainStepTinyModelAdagrad(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial(paddle.optimizer.Adagrad, 0.001) + self.lr_creator = lambda: 0.001 + self.optimizer_creator = paddle.optimizer.Adagrad self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -107,7 +116,8 @@ class TestTrainStepTinyModelAdam(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial(paddle.optimizer.Adam, 0.001) + self.lr_creator = lambda: 0.001 + self.optimizer_creator = paddle.optimizer.Adam self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -117,7 +127,8 @@ class TestTrainStepTinyModelAdamax(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial(paddle.optimizer.Adamax, 0.001) + self.lr_creator = lambda: 0.001 + self.optimizer_creator = paddle.optimizer.Adamax self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -127,7 +138,8 @@ class TestTrainStepTinyModelAdamW(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial(paddle.optimizer.AdamW, 0.001) + self.lr_creator = lambda: 0.001 + self.optimizer_creator = paddle.optimizer.AdamW self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -137,8 +149,9 @@ class TestTrainStepTinyModelLamb(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel + self.lr_creator = lambda: 0.001 self.optimizer_creator = partial( - paddle.optimizer.Lamb, learning_rate=0.001, lamb_weight_decay=0.01 + paddle.optimizer.Lamb, lamb_weight_decay=0.01 ) self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model @@ -149,7 +162,8 @@ class TestTrainStepTinyModelMomentum(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial(paddle.optimizer.Momentum, 0.001) + self.lr_creator = lambda: 0.001 + self.optimizer_creator = paddle.optimizer.Momentum self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -159,7 +173,8 @@ class TestTrainStepTinyModelRMSProp(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial(paddle.optimizer.RMSProp, 0.001) + self.lr_creator = lambda: 0.001 + 
self.optimizer_creator = paddle.optimizer.RMSProp self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -169,10 +184,10 @@ class TestTrainStepTinyModelLRNoamDecay(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100), + self.lr_creator = partial( + paddle.optimizer.lr.NoamDecay, d_model=0.01, warmup_steps=100 ) + self.optimizer_creator = paddle.optimizer.SGD self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -182,12 +197,12 @@ class TestTrainStepTinyModelLRPiecewiseDecay(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.PiecewiseDecay( - boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4] - ), + self.lr_creator = partial( + paddle.optimizer.lr.PiecewiseDecay, + boundaries=[3, 6, 9], + values=[0.1, 0.2, 0.3, 0.4], ) + self.optimizer_creator = paddle.optimizer.SGD self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -197,10 +212,12 @@ class TestTrainStepTinyModelLRNaturalExpDecay(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1), + self.lr_creator = partial( + paddle.optimizer.lr.NaturalExpDecay, + learning_rate=0.5, + gamma=0.1, ) + self.optimizer_creator = partial(paddle.optimizer.SGD) self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -210,10 +227,10 @@ class TestTrainStepTinyModelLRInverseTimeDecay(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1), + self.lr_creator = partial( + paddle.optimizer.lr.InverseTimeDecay, learning_rate=0.5, gamma=0.1 ) + self.optimizer_creator = paddle.optimizer.SGD self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -223,12 +240,13 @@ class TestTrainStepTinyModelLRPolynomialDecay(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.PolynomialDecay( - learning_rate=0.5, decay_steps=20 - ), + self.lr_creator = partial( + paddle.optimizer.lr.PolynomialDecay, + learning_rate=0.5, + decay_steps=20, ) + self.optimizer_creator = paddle.optimizer.SGD + self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -238,12 +256,14 @@ class TestTrainStepTinyModelLRLinearWarmup(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.LinearWarmup( - learning_rate=0.5, warmup_steps=2, start_lr=0, end_lr=0.5 - ), + self.lr_creator = partial( + paddle.optimizer.lr.LinearWarmup, + learning_rate=0.5, + warmup_steps=2, + start_lr=0, + end_lr=0.5, ) + self.optimizer_creator = partial(paddle.optimizer.SGD) self.loss_fn = loss_fn_tiny_model self.train_step_func = 
train_step_tiny_model self.steps = 3 @@ -253,10 +273,10 @@ class TestTrainStepTinyModelLRExponentialDecay(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9), + self.lr_creator = partial( + paddle.optimizer.lr.ExponentialDecay, learning_rate=0.5, gamma=0.9 ) + self.optimizer_creator = paddle.optimizer.SGD self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -266,12 +286,14 @@ class TestTrainStepTinyModelLRMultiStepDecay(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.MultiStepDecay( - learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8 - ), + self.lr_creator = partial( + paddle.optimizer.lr.MultiStepDecay, + learning_rate=0.5, + milestones=[2, 4, 6], + gamma=0.8, ) + self.optimizer_creator = paddle.optimizer.SGD + self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -281,12 +303,13 @@ class TestTrainStepTinyModelLRStepDecay(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.StepDecay( - learning_rate=0.5, step_size=5, gamma=0.8 - ), + self.lr_creator = partial( + paddle.optimizer.lr.StepDecay, + learning_rate=0.5, + step_size=5, + gamma=0.8, ) + self.optimizer_creator = paddle.optimizer.SGD self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -296,12 +319,12 @@ class TestTrainStepTinyModelLRLambdaDecay(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.LambdaDecay( - learning_rate=0.5, lr_lambda=lambda x: 0.95**x - ), + self.lr_creator = partial( + paddle.optimizer.lr.LambdaDecay, + learning_rate=0.5, + lr_lambda=lambda x: 0.95**x, ) + self.optimizer_creator = paddle.optimizer.SGD self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -311,12 +334,13 @@ class TestTrainStepTinyModelLRReduceOnPlateau(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.ReduceOnPlateau( - learning_rate=1.0, factor=0.5, patience=5 - ), + self.lr_creator = partial( + paddle.optimizer.lr.ReduceOnPlateau, + learning_rate=1.0, + factor=0.5, + patience=5, ) + self.optimizer_creator = paddle.optimizer.SGD self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -326,12 +350,12 @@ class TestTrainStepTinyModelLRCosineAnnealingDecay(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.CosineAnnealingDecay( - learning_rate=0.5, T_max=10 - ), + self.lr_creator = partial( + paddle.optimizer.lr.CosineAnnealingDecay, + learning_rate=0.5, + T_max=10, ) + self.optimizer_creator = paddle.optimizer.SGD self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -341,12 +365,12 @@ class 
TestTrainStepTinyModelLRMultiplicativeDecay(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.MultiplicativeDecay( - learning_rate=0.5, lr_lambda=lambda x: 0.95 - ), + self.lr_creator = partial( + paddle.optimizer.lr.MultiplicativeDecay, + learning_rate=0.5, + lr_lambda=lambda x: 0.95, ) + self.optimizer_creator = paddle.optimizer.SGD self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -356,12 +380,10 @@ class TestTrainStepTinyModelLROneCycleLR(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.OneCycleLR( - max_learning_rate=1.0, total_steps=3 - ), + self.lr_creator = partial( + paddle.optimizer.lr.OneCycleLR, max_learning_rate=1.0, total_steps=3 ) + self.optimizer_creator = paddle.optimizer.SGD self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 @@ -371,15 +393,14 @@ class TestTrainStepTinyModelLRCyclicLR(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([10000, 10]) self.net_creator = TinyModel - self.optimizer_creator = partial( - paddle.optimizer.SGD, - paddle.optimizer.lr.CyclicLR( - base_learning_rate=0.5, - max_learning_rate=1.0, - step_size_up=15, - step_size_down=5, - ), + self.lr_creator = partial( + paddle.optimizer.lr.CyclicLR, + base_learning_rate=0.5, + max_learning_rate=1.0, + step_size_up=15, + step_size_down=5, ) + self.optimizer_creator = paddle.optimizer.SGD self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_adam.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_adam.py index e4b68eaf85ab5..8506349b43577 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_adam.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_adam.py @@ -13,7 +13,6 @@ # limitations under the License. import unittest -from functools import partial from test_train_step import ( TestTrainStepTinyModel, @@ -29,7 +28,8 @@ class TestTrainStepResNet18Adam(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([64, 3, 224, 224]) self.net_creator = resnet18 - self.optimizer_creator = partial(paddle.optimizer.Adam, 0.001) + self.lr_creator = lambda: 0.001 + self.optimizer_creator = paddle.optimizer.Adam self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_sgd.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_sgd.py index 2a08467e1b16b..8760c1e5adc9e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_sgd.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_sgd.py @@ -13,7 +13,6 @@ # limitations under the License. 
import unittest -from functools import partial from test_train_step import ( TestTrainStepTinyModel, @@ -29,7 +28,8 @@ class TestTrainStepResNet18Sgd(TestTrainStepTinyModel): def setUp(self): self.input = paddle.randn([64, 3, 224, 224]) self.net_creator = resnet18 - self.optimizer_creator = partial(paddle.optimizer.SGD, 0.001) + self.lr_creator = lambda: 0.001 + self.optimizer_creator = paddle.optimizer.SGD self.loss_fn = loss_fn_tiny_model self.train_step_func = train_step_tiny_model self.steps = 3 diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 0f5e61fcd98e4..8a8f6621fb105 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -217,6 +217,8 @@ def __call__(self, inputs): self._cast_fp16_if_pure_fp16(in_vars) attrs = self._prepare_attributes() + self._sync_lr_value_with_scheduler() + _legacy_C_ops.run_program( self._valid_vars(in_vars), self._valid_vars(self._params), @@ -231,6 +233,23 @@ def __call__(self, inputs): restored_nest_out = self._restore_out(out_vars) return self._remove_no_value(restored_nest_out) + def _sync_lr_value_with_scheduler(self): + """Update lr_var value with calculated by lr_scheduler.""" + main_program = self._origin_main_program + if hasattr(main_program, 'lr_sheduler') and hasattr( + main_program, 'lr_var' + ): + lr_scheduler = main_program.lr_sheduler + lr_var = main_program.lr_var + from paddle.fluid.data_feeder import convert_dtype + from paddle.optimizer.lr import LRScheduler + + assert isinstance(lr_scheduler, LRScheduler), "must be LRScheduler" + lr_sheduler = self._origin_main_program.lr_sheduler + lr_value = lr_sheduler() + data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype)) + lr_var.set_value(data) + def set_hooker(self, hooker): self._hooker = hooker From 4017eff624ff0e94a66e36e358c671204caf8eef Mon Sep 17 00:00:00 2001 From: SigureMo Date: Fri, 24 Mar 2023 08:50:04 +0000 Subject: [PATCH 27/43] move imports --- python/paddle/jit/dy2static/partial_program.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 0f116eacd464a..b1ef07c91e52c 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -21,8 +21,10 @@ from paddle.amp.auto_cast import _in_amp_guard, _in_pure_fp16_guard from paddle.fluid import backward, core, framework, program_guard from paddle.fluid.compiler import BuildStrategy +from paddle.fluid.data_feeder import convert_dtype from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.framework import _apply_pass +from paddle.optimizer.lr import LRScheduler from . 
import logging_utils from .utils import RETURN_NO_VALUE_MAGIC_NUM, _out_grad_names, _param_grad_names @@ -232,8 +234,6 @@ def _sync_lr_value_with_scheduler(self): ): lr_scheduler = main_program.lr_sheduler lr_var = main_program.lr_var - from paddle.fluid.data_feeder import convert_dtype - from paddle.optimizer.lr import LRScheduler assert isinstance(lr_scheduler, LRScheduler), "must be LRScheduler" lr_sheduler = self._origin_main_program.lr_sheduler From b8ccbd95e1ebf7ce9615b7a5e3a1573cebf3f7f1 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Mon, 27 Mar 2023 07:17:44 +0000 Subject: [PATCH 28/43] use name to identify the dropout_state in rnn --- python/paddle/nn/layer/rnn.py | 3 ++- python/paddle/optimizer/optimizer.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 904a8d2c11aac..e8b94e10e3c12 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -1428,7 +1428,8 @@ def flatten_parameters(self): # dropout state may also can be hided and avoid saving # should dropout state be persistable for static-graph self._dropout_state = self.create_variable( - dtype=core.VarDesc.VarType.UINT8 + dtype=core.VarDesc.VarType.UINT8, + name="dropout_state", ) if in_dynamic_mode(): with paddle.no_grad(): diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 413d62321772b..42b5c2908dd15 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -441,7 +441,7 @@ def do_create(): self._learning_rate._var_name = lr_name lr_var = self.helper.create_global_variable( name=lr_name, - shape=[1], + shape=[], persistable=True, stop_gradient=True, dtype=_lr_dtype, @@ -469,7 +469,7 @@ def do_create(): framework.default_main_program() ] = paddle.static.create_global_var( name=unique_name.generate("learning_rate"), - shape=[1], + shape=[], value=float(self._learning_rate), dtype=_lr_dtype, persistable=True, From 3d87c6311c13db9fff6916121573ffb3d0ba9338 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Mon, 27 Mar 2023 07:20:17 +0000 Subject: [PATCH 29/43] inc train step time limit --- .../fluid/tests/unittests/dygraph_to_static/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index 55cab1f34ebb4..d5991ddfc369c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -66,8 +66,8 @@ set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) #set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120) -set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 900) -set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 900) +set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 1200) +set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 1200) if(NOT WIN32) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) From 82c5cf02d3babb6a2d38fdeb06527675c0971211 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Mon, 27 Mar 2023 11:10:08 +0000 Subject: [PATCH 30/43] fix 0d lr scheduler value --- python/paddle/jit/dy2static/partial_program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index b1ef07c91e52c..3721ff5f052aa 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -238,7 +238,7 @@ def _sync_lr_value_with_scheduler(self): assert isinstance(lr_scheduler, LRScheduler), "must be LRScheduler" lr_sheduler = self._origin_main_program.lr_sheduler lr_value = lr_sheduler() - data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype)) + data = np.array(lr_value).astype(convert_dtype(lr_var.dtype)) lr_var.set_value(data) def set_hooker(self, hooker): From 3db82c2324f8890f62b1fe334ea985a2bf1c3d74 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Mon, 27 Mar 2023 11:50:41 +0000 Subject: [PATCH 31/43] add some missing committed changes --- python/paddle/fluid/dygraph/base.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index e404733530c48..b411e51dc6bb6 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -145,8 +145,10 @@ def _convert_into_variable(tensor): # and necessary for inferring. It will be pruned if it's not necessary for inferring. # But if its shape is empty while created from `create_variable()`, we consider this buffer - # non-persistable. See case of `drop_state` in lstm api. - is_persistable = len(tensor.shape) > 0 + # non-persistable. See case of `dropout_state` in lstm api. + is_persistable = True + if hasattr(new_var, "name") and "dropout_state" in new_var.name: + is_persistable = False new_var = tensor._to_static_var( to_parameter=False, persistable=is_persistable @@ -155,7 +157,7 @@ def _convert_into_variable(tensor): if new_var.persistable is True: # TODO(@xiongkun): 0d-tensor may be affected at present, # but there is no particularly good method to identify whether 0d-tensor - # is used as buffer or "drop_out_state" in LSTM buffer variable. + # is used as buffer or "dropout_state" in LSTM buffer variable. from paddle.jit.dy2static.program_translator import ( ProgramTranslator, ) From 3001565157562fe67e22482cb9ac76d631ab8a74 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Mon, 27 Mar 2023 16:07:41 +0000 Subject: [PATCH 32/43] `new_var` -> `tensor` --- python/paddle/fluid/dygraph/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index b411e51dc6bb6..15d899b0912be 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -147,7 +147,7 @@ def _convert_into_variable(tensor): # But if its shape is empty while created from `create_variable()`, we consider this buffer # non-persistable. See case of `dropout_state` in lstm api. 
is_persistable = True - if hasattr(new_var, "name") and "dropout_state" in new_var.name: + if "dropout_state" in tensor.name: is_persistable = False new_var = tensor._to_static_var( From ed5b0dd33c581b7eb6c14fdb71443d5e11c5b02d Mon Sep 17 00:00:00 2001 From: SigureMo Date: Tue, 28 Mar 2023 07:59:45 +0000 Subject: [PATCH 33/43] `sheduler` -> `scheduler` --- python/paddle/jit/dy2static/partial_program.py | 8 ++++---- python/paddle/optimizer/optimizer.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 188414f1ec40d..8c07906f007e8 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -229,15 +229,15 @@ def __call__(self, inputs): def _sync_lr_value_with_scheduler(self): """Update lr_var value with calculated by lr_scheduler.""" main_program = self._origin_main_program - if hasattr(main_program, 'lr_sheduler') and hasattr( + if hasattr(main_program, 'lr_scheduler') and hasattr( main_program, 'lr_var' ): - lr_scheduler = main_program.lr_sheduler + lr_scheduler = main_program.lr_scheduler lr_var = main_program.lr_var assert isinstance(lr_scheduler, LRScheduler), "must be LRScheduler" - lr_sheduler = self._origin_main_program.lr_sheduler - lr_value = lr_sheduler() + lr_scheduler = self._origin_main_program.lr_scheduler + lr_value = lr_scheduler() data = np.array(lr_value).astype(convert_dtype(lr_var.dtype)) lr_var.set_value(data) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index f9066e53c6893..5a702a86ef005 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -447,7 +447,7 @@ def do_create(): dtype=_lr_dtype, ) main_prog = framework.default_main_program() - main_prog.lr_sheduler = self._learning_rate + main_prog.lr_scheduler = self._learning_rate main_prog.lr_var = lr_var self._learning_rate_map[ From 7183931b67fe5acd5cd68d0a314d04669311494f Mon Sep 17 00:00:00 2001 From: SigureMo Date: Wed, 29 Mar 2023 12:30:22 +0000 Subject: [PATCH 34/43] apply some suggestions --- python/paddle/fluid/dygraph/base.py | 7 ++---- python/paddle/fluid/framework.py | 11 ++++----- .../dygraph_to_static/CMakeLists.txt | 10 ++++---- python/paddle/nn/layer/rnn.py | 23 ++++++++++--------- 4 files changed, 25 insertions(+), 26 deletions(-) diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 15d899b0912be..c7769c4b84ae1 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -44,7 +44,7 @@ 'to_variable', ] -# Flag that indicates whether running code under `@to_static` +NON_PERSISTABLE_VAR_NAME_SUFFIX = "__non_persistable" def in_declarative_mode(): @@ -147,7 +147,7 @@ def _convert_into_variable(tensor): # But if its shape is empty while created from `create_variable()`, we consider this buffer # non-persistable. See case of `dropout_state` in lstm api. is_persistable = True - if "dropout_state" in tensor.name: + if tensor.name.endswith(NON_PERSISTABLE_VAR_NAME_SUFFIX): is_persistable = False new_var = tensor._to_static_var( @@ -155,9 +155,6 @@ def _convert_into_variable(tensor): ) # add param into parameter recorder to collect all the params used in this program. 
if new_var.persistable is True: - # TODO(@xiongkun): 0d-tensor may be affected at present, - # but there is no particularly good method to identify whether 0d-tensor - # is used as buffer or "dropout_state" in LSTM buffer variable. from paddle.jit.dy2static.program_translator import ( ProgramTranslator, ) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b21d8b22c78e9..312496f87e98c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -27,11 +27,9 @@ import multiprocessing import sys import logging -import paddle from .proto import framework_pb2, data_feed_pb2 - from . import core from . import unique_name import paddle.version as fluid_version @@ -1719,12 +1717,14 @@ def backward(self, retain_graph=False): loss.backward() """ + from .backward import append_backward + if retain_graph is True: raise AssertionError( "`retain_graph` == True is not supported in @to_static function." "please set retain_graph = False." ) - param_grad_list = paddle.fluid.backward.append_backward(self) + param_grad_list = append_backward(self) for param, param_grad in param_grad_list: # set grad to simulate dygraph loss.backward() in static mode. setattr(param, "grad", param_grad) @@ -7625,12 +7625,11 @@ def _get_var(name, program=None): @signature_safe_contextmanager def dygraph_guard_if_declarative(): from .dygraph.base import in_declarative_mode + from .dygraph import Tracer if in_declarative_mode(): # Under @paddle.jit.to_static decorator, we switch back dygraph mode temporarily. - with paddle.fluid.framework._dygraph_guard( - tracer=paddle.fluid.dygraph.Tracer() - ): + with _dygraph_guard(tracer=Tracer()): yield else: yield diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index d5991ddfc369c..3dc4080017f35 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -39,9 +39,13 @@ if(WITH_PYTHON) set_tests_properties(test_lac PROPERTIES TIMEOUT 120) endif() +# Disable on Windows CPU CI for timeout if(WIN32 AND NOT WITH_GPU) - list(REMOVE_ITEM TEST_OPS test_resnet_amp - )# disable on Windows CPU CI for timeout + list(REMOVE_ITEM TEST_OPS test_resnet_amp) + # TODO(SigureMo): Temporarily disable train step on Windows CPU CI. + # We should remove this after fix the performance issue. 
+ list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_adam) + list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_sgd) endif() foreach(TEST_OP ${TEST_OPS}) @@ -66,8 +70,6 @@ set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) #set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120) -set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 1200) -set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 1200) if(NOT WIN32) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 13d1b85d5a6a2..be6e83932d3b6 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -22,6 +22,7 @@ from paddle import _C_ops, _legacy_C_ops, framework, in_dynamic_mode from paddle.common_ops_import import Variable from paddle.fluid.data_feeder import check_type, check_variable_and_dtype +from paddle.fluid.dygraph.base import NON_PERSISTABLE_VAR_NAME_SUFFIX from paddle.fluid.framework import ( _non_static_mode, default_startup_program, @@ -47,7 +48,7 @@ def rnn( sequence_length=None, time_major=False, is_reverse=False, - **kwargs + **kwargs, ): r""" rnn creates a recurrent neural network specified by RNNCell `cell`, @@ -109,7 +110,7 @@ def rnn( sequence_length, time_major, is_reverse, - **kwargs + **kwargs, ) else: return _rnn_static_graph( @@ -119,7 +120,7 @@ def rnn( sequence_length, time_major, is_reverse, - **kwargs + **kwargs, ) @@ -155,7 +156,7 @@ def _rnn_dynamic_graph( sequence_length=None, time_major=False, is_reverse=False, - **kwargs + **kwargs, ): time_step_index = 0 if time_major else 1 flat_inputs = paddle.utils.flatten(inputs) @@ -223,7 +224,7 @@ def _rnn_static_graph( sequence_length=None, time_major=False, is_reverse=False, - **kwargs + **kwargs, ): check_type(inputs, 'inputs', (Variable, list, tuple), 'rnn') if isinstance(inputs, (list, tuple)): @@ -359,7 +360,7 @@ def birnn( initial_states=None, sequence_length=None, time_major=False, - **kwargs + **kwargs, ): r""" birnn creates a bidirectional recurrent neural network specified by @@ -432,7 +433,7 @@ def birnn( states_fw, sequence_length, time_major=time_major, - **kwargs + **kwargs, ) outputs_bw, states_bw = rnn( @@ -442,7 +443,7 @@ def birnn( sequence_length, time_major=time_major, is_reverse=True, - **kwargs + **kwargs, ) outputs = paddle.utils.map_structure( @@ -1209,7 +1210,7 @@ def forward( sequence_length=sequence_length, time_major=self.time_major, is_reverse=self.is_reverse, - **kwargs + **kwargs, ) return final_outputs, final_states @@ -1296,7 +1297,7 @@ def forward( initial_states, sequence_length, self.time_major, - **kwargs + **kwargs, ) return outputs, final_states @@ -1429,7 +1430,7 @@ def flatten_parameters(self): # should dropout state be persistable for static-graph self._dropout_state = self.create_variable( dtype=core.VarDesc.VarType.UINT8, - name="dropout_state", + name=f"dropout_state{NON_PERSISTABLE_VAR_NAME_SUFFIX}", ) if in_dynamic_mode(): with paddle.no_grad(): From 25cac142561afb2884b995d342866e3a501b7f4b Mon Sep 17 00:00:00 2001 From: SigureMo Date: Wed, 29 Mar 2023 15:16:47 +0000 Subject: [PATCH 35/43] test on gpu only --- .../unittests/dygraph_to_static/CMakeLists.txt | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index 3dc4080017f35..b9db389572181 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -39,13 +39,9 @@ if(WITH_PYTHON) set_tests_properties(test_lac PROPERTIES TIMEOUT 120) endif() -# Disable on Windows CPU CI for timeout if(WIN32 AND NOT WITH_GPU) - list(REMOVE_ITEM TEST_OPS test_resnet_amp) - # TODO(SigureMo): Temporarily disable train step on Windows CPU CI. - # We should remove this after fix the performance issue. - list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_adam) - list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_sgd) + list(REMOVE_ITEM TEST_OPS test_resnet_amp + )# disable on Windows CPU CI for timeout endif() foreach(TEST_OP ${TEST_OPS}) @@ -83,3 +79,10 @@ if(APPLE) set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 300) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 300) endif() + +# TODO(SigureMo): Temporarily disable train step on CPU CI. +# We should remove this after fix the performance issue. +if(NOT WITH_GPU) + list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_adam) + list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_sgd) +endif() From faa5a16cd55d692c18b73121a53ff764653b759d Mon Sep 17 00:00:00 2001 From: SigureMo Date: Thu, 30 Mar 2023 01:34:43 +0000 Subject: [PATCH 36/43] add TIMEOUT for gpu tests --- .../fluid/tests/unittests/dygraph_to_static/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index b9db389572181..5475cf9e77d12 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -66,6 +66,8 @@ set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) #set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120) +set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 120) +set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 120) if(NOT WIN32) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) From 21d4f833b42328194fc30004072e5e2016d2bc1a Mon Sep 17 00:00:00 2001 From: SigureMo Date: Fri, 31 Mar 2023 06:24:13 +0000 Subject: [PATCH 37/43] set uts timeout to 240 --- .../fluid/tests/unittests/dygraph_to_static/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index 5475cf9e77d12..6c818a2c52397 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -66,8 +66,8 @@ set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) #set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120) -set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 120) -set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 120) +set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 240) +set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 
240) if(NOT WIN32) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) From 5a4e933e7588ac71d18b3e7649233a3d6e9ede5e Mon Sep 17 00:00:00 2001 From: SigureMo Date: Fri, 31 Mar 2023 07:30:50 +0000 Subject: [PATCH 38/43] move uts to new place --- .../tests/unittests => test}/dygraph_to_static/test_train_step.py | 0 .../dygraph_to_static/test_train_step_resnet18_adam.py | 0 .../dygraph_to_static/test_train_step_resnet18_sgd.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename {python/paddle/fluid/tests/unittests => test}/dygraph_to_static/test_train_step.py (100%) rename {python/paddle/fluid/tests/unittests => test}/dygraph_to_static/test_train_step_resnet18_adam.py (100%) rename {python/paddle/fluid/tests/unittests => test}/dygraph_to_static/test_train_step_resnet18_sgd.py (100%) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py b/test/dygraph_to_static/test_train_step.py similarity index 100% rename from python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step.py rename to test/dygraph_to_static/test_train_step.py diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_adam.py b/test/dygraph_to_static/test_train_step_resnet18_adam.py similarity index 100% rename from python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_adam.py rename to test/dygraph_to_static/test_train_step_resnet18_adam.py diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_sgd.py b/test/dygraph_to_static/test_train_step_resnet18_sgd.py similarity index 100% rename from python/paddle/fluid/tests/unittests/dygraph_to_static/test_train_step_resnet18_sgd.py rename to test/dygraph_to_static/test_train_step_resnet18_sgd.py From fbfe26a7de2889bb4bd17aee3b3f2fb68330a97b Mon Sep 17 00:00:00 2001 From: SigureMo Date: Fri, 31 Mar 2023 11:06:47 +0000 Subject: [PATCH 39/43] inc train step time limit --- test/dygraph_to_static/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/dygraph_to_static/CMakeLists.txt b/test/dygraph_to_static/CMakeLists.txt index 6c818a2c52397..107342e4eb46c 100644 --- a/test/dygraph_to_static/CMakeLists.txt +++ b/test/dygraph_to_static/CMakeLists.txt @@ -66,8 +66,8 @@ set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) #set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120) -set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 240) -set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 240) +set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 360) +set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 360) if(NOT WIN32) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) From 38f2255ab9c2ee91a9e4eb834e95d20af9e91e8a Mon Sep 17 00:00:00 2001 From: SigureMo Date: Sat, 1 Apr 2023 12:55:13 +0000 Subject: [PATCH 40/43] inc train step time limit --- test/dygraph_to_static/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/dygraph_to_static/CMakeLists.txt b/test/dygraph_to_static/CMakeLists.txt index 107342e4eb46c..eaaebd72de439 100644 --- a/test/dygraph_to_static/CMakeLists.txt +++ b/test/dygraph_to_static/CMakeLists.txt @@ -66,8 +66,8 @@ set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) 
#set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120) -set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 360) -set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 360) +set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 600) +set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 600) if(NOT WIN32) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) From 5ba356fe0a85e98239edcf03e445c768c1a41c3a Mon Sep 17 00:00:00 2001 From: SigureMo Date: Mon, 3 Apr 2023 02:21:10 +0000 Subject: [PATCH 41/43] adjust cmake for ut --- test/dygraph_to_static/CMakeLists.txt | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/test/dygraph_to_static/CMakeLists.txt b/test/dygraph_to_static/CMakeLists.txt index eaaebd72de439..4cb36708d5f83 100644 --- a/test/dygraph_to_static/CMakeLists.txt +++ b/test/dygraph_to_static/CMakeLists.txt @@ -39,9 +39,17 @@ if(WITH_PYTHON) set_tests_properties(test_lac PROPERTIES TIMEOUT 120) endif() +# Disable on Windows CPU CI for timeout if(WIN32 AND NOT WITH_GPU) - list(REMOVE_ITEM TEST_OPS test_resnet_amp - )# disable on Windows CPU CI for timeout + list(REMOVE_ITEM TEST_OPS test_resnet_amp) + + # TODO(SigureMo): Temporarily disable train step on Windows CPU CI. + # We should remove this after fix the performance issue. + list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_adam) + list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_sgd) +else() + set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 240) + set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 240) endif() foreach(TEST_OP ${TEST_OPS}) @@ -66,8 +74,6 @@ set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) #set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120) -set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 600) -set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 600) if(NOT WIN32) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) @@ -81,10 +87,3 @@ if(APPLE) set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 300) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 300) endif() - -# TODO(SigureMo): Temporarily disable train step on CPU CI. -# We should remove this after fix the performance issue. -if(NOT WITH_GPU) - list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_adam) - list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_sgd) -endif() From 8e98ae2d4e039f06e569a274c55376475b7b4daa Mon Sep 17 00:00:00 2001 From: SigureMo Date: Mon, 3 Apr 2023 02:35:05 +0000 Subject: [PATCH 42/43] adjust cmake for ut --- test/dygraph_to_static/CMakeLists.txt | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/test/dygraph_to_static/CMakeLists.txt b/test/dygraph_to_static/CMakeLists.txt index 4cb36708d5f83..0aff52fbf1859 100644 --- a/test/dygraph_to_static/CMakeLists.txt +++ b/test/dygraph_to_static/CMakeLists.txt @@ -39,17 +39,9 @@ if(WITH_PYTHON) set_tests_properties(test_lac PROPERTIES TIMEOUT 120) endif() -# Disable on Windows CPU CI for timeout if(WIN32 AND NOT WITH_GPU) - list(REMOVE_ITEM TEST_OPS test_resnet_amp) - - # TODO(SigureMo): Temporarily disable train step on Windows CPU CI. - # We should remove this after fix the performance issue. 
- list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_adam) - list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_sgd) -else() - set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 240) - set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 240) + list(REMOVE_ITEM TEST_OPS test_resnet_amp + )# disable on Windows CPU CI for timeout endif() foreach(TEST_OP ${TEST_OPS}) @@ -87,3 +79,13 @@ if(APPLE) set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 300) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 300) endif() + +if(WIN32 AND NOT WITH_GPU) + # TODO(SigureMo): Temporarily disable train step on Windows CPU CI. + # We should remove this after fix the performance issue. + list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_adam) + list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_sgd) +else() + set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 240) + set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 240) +endif() From 0c4876248eb286fdfd8c1192b40f431d35ed516c Mon Sep 17 00:00:00 2001 From: SigureMo Date: Mon, 3 Apr 2023 03:29:08 +0000 Subject: [PATCH 43/43] adjust cmake for ut --- test/dygraph_to_static/CMakeLists.txt | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/test/dygraph_to_static/CMakeLists.txt b/test/dygraph_to_static/CMakeLists.txt index 0aff52fbf1859..d8bec02ae9ff8 100644 --- a/test/dygraph_to_static/CMakeLists.txt +++ b/test/dygraph_to_static/CMakeLists.txt @@ -44,6 +44,13 @@ if(WIN32 AND NOT WITH_GPU) )# disable on Windows CPU CI for timeout endif() +if(NOT WITH_GPU) + # TODO(SigureMo): Temporarily disable train step on Windows CPU CI. + # We should remove this after fix the performance issue. + list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_adam) + list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_sgd) +endif() + foreach(TEST_OP ${TEST_OPS}) list(FIND TEST_EAGER_OPS ${TEST_OP} WAS_FOUND) if(NOT WAS_FOUND EQUAL -1) @@ -80,12 +87,7 @@ if(APPLE) set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 300) endif() -if(WIN32 AND NOT WITH_GPU) - # TODO(SigureMo): Temporarily disable train step on Windows CPU CI. - # We should remove this after fix the performance issue. - list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_adam) - list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_sgd) -else() +if(WITH_GPU) set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 240) set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 240) endif()