
Commit

remove the optimizer base and learning rate base (#56099)
* remove the optimizer base and learning rate base

* fix bug

* fix bug
longranger2 authored Aug 11, 2023
1 parent f60c698 commit 6eaed2d
Showing 16 changed files with 65 additions and 1,663 deletions.
3 changes: 1 addition & 2 deletions python/paddle/amp/auto_cast.py
@@ -250,7 +250,6 @@ def _is_valid_optimizer(optimizer):
         optimizer,
         (
             paddle.optimizer.Optimizer,
-            paddle.fluid.optimizer.Optimizer,
             DygraphShardingOptimizer,
         ),
     )
@@ -260,7 +259,7 @@ def check_optimizers(optimizers):
     for optimizer in optimizers:
         if not _is_valid_optimizer(optimizer):
             raise RuntimeError(
-                "Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or paddle.fluid.optimizer.Optimizer or DygraphShardingOptimizer, but receive {}.".format(
+                "Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or DygraphShardingOptimizer, but receive {}.".format(
                     type(optimizer)
                 )
             )
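With paddle.fluid.optimizer gone, the pure-fp16 check accepts only paddle.optimizer.Optimizer subclasses (or DygraphShardingOptimizer). A minimal sketch of what still passes the check, assuming a small Linear model; this snippet is illustrative and not part of the commit:

import paddle

linear = paddle.nn.Linear(4, 4)
# Only optimizers built from paddle.optimizer pass the O2 (pure fp16) check now.
opt = paddle.optimizer.Adam(learning_rate=1e-3, parameters=linear.parameters())
model, opt = paddle.amp.decorate(models=linear, optimizers=opt, level='O2')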
3 changes: 1 addition & 2 deletions python/paddle/distributed/auto_parallel/static/engine.py
@@ -146,11 +146,10 @@ def __init__(

         if optimizer and not isinstance(
             optimizer,
-            (paddle.optimizer.Optimizer, paddle.static.Optimizer),
+            (paddle.optimizer.Optimizer),
         ):
             raise TypeError(
                 "'optimizer' must be object of class `paddle.optimizer.Optimizer`"
-                " or `paddle.static.Optimizer`."
             )
         self._optimizer = auto_utils.validate_opt(optimizer)

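A hedged sketch of the constraint this enforces: the auto-parallel Engine now rejects anything that is not a paddle.optimizer.Optimizer, since paddle.static.Optimizer is no longer special-cased. The Engine call below assumes the fleet.auto signature (model, loss, optimizer, metrics) and is illustrative, not part of this commit:

import paddle
from paddle.distributed.fleet import auto

mlp = paddle.nn.Linear(16, 2)
loss = paddle.nn.CrossEntropyLoss()
opt = paddle.optimizer.AdamW(learning_rate=1e-3, parameters=mlp.parameters())  # accepted
# Passing a paddle.static.Optimizer here would now raise the TypeError above.
engine = auto.Engine(mlp, loss, opt, metrics=paddle.metric.Accuracy())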
59 changes: 54 additions & 5 deletions python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
@@ -20,11 +20,11 @@

 import paddle
 from paddle.common_ops_import import LayerHelper
+from paddle.fluid import framework
 from paddle.fluid.dygraph import base as imperative_base
-from paddle.fluid.optimizer import Optimizer
 from paddle.framework import core, in_dynamic_mode
 from paddle.nn.clip import ClipGradByNorm, append_gradient_clip_ops
-from paddle.optimizer import Momentum
+from paddle.optimizer import Momentum, Optimizer
 from paddle.regularizer import L1Decay, L2Decay
 from paddle.static import create_global_var

@@ -58,8 +58,8 @@ def __init__(
         assert momentum is not None
         super().__init__(
             learning_rate=learning_rate,
-            parameter_list=parameter_list,
-            regularization=regularization,
+            parameters=parameter_list,
+            weight_decay=regularization,
             grad_clip=grad_clip,
             name=name,
         )
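The keyword renames above follow the paddle.optimizer.Optimizer signature that DGCMomentumOptimizer now inherits: `parameters` replaces the old fluid `parameter_list` and `weight_decay` replaces `regularization`. A minimal sketch of the new base-class keywords, using plain Momentum for illustration only:

import paddle

linear = paddle.nn.Linear(8, 8)
opt = paddle.optimizer.Momentum(
    learning_rate=0.1,
    momentum=0.9,
    parameters=linear.parameters(),                  # was parameter_list in fluid
    weight_decay=paddle.regularizer.L2Decay(1e-4),   # was regularization in fluid
    grad_clip=paddle.nn.ClipGradByNorm(clip_norm=1.0),
)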
@@ -396,6 +396,55 @@ def _dgc_op(
             op_maker.kOpRoleVarAttrName(), [param_var.name, grad_var.name]
         )

+    def _process_distribute_lookuptable(self, param_grads):
+        """
+        Because distribute lookup table only support SGD optimizer for now, not support
+        other optimizer and regularization, so we should find the table parameter out,
+        and avoid to add regularization and other op for it, and add sgd optimize op
+        for it independently.
+        :param param_grads(list((Var, Var))): list of (param, grad) pair.
+        :param loss: the loss variable.
+        :param startup_program: the startup program
+        """
+        from paddle.distributed.distribute_lookup_table import (
+            find_distributed_lookup_table,
+        )
+
+        program = framework.default_main_program()
+        global_block = framework.default_main_program().global_block()
+        table_name = find_distributed_lookup_table(program)
+        table_param = None
+        table_grad = None
+        new_param_grads = []
+        for p, g in param_grads:
+            if p.name == table_name:
+                if table_param is not None:
+                    raise RuntimeError(
+                        "multi dist table var found, only support one now!"
+                    )
+                table_param = p
+                table_grad = g
+            else:
+                new_param_grads.append((p, g))
+        sgd_op = None
+        if table_param is not None:
+            param_and_grad = [table_param, table_grad]
+            with table_param.block.program._optimized_guard(
+                param_and_grad
+            ), framework.name_scope("optimizer"):
+                self._create_global_learning_rate()
+                # create the optimize op
+                sgd_op = global_block.append_op(
+                    type='sgd',
+                    inputs={
+                        "Param": table_param,
+                        "Grad": table_grad,
+                        "LearningRate": self._create_param_lr(param_and_grad),
+                    },
+                    outputs={"ParamOut": param_and_grad[0]},
+                )
+        return new_param_grads, (table_param, table_grad), sgd_op
+
     @imperative_base.no_grad()
     def apply_gradients(self, params_grads):
         # Note: since we can't use all_reduce_op now,
@@ -532,7 +581,7 @@ def apply_gradients(self, params_grads):

     def apply_optimize(self, loss, startup_program, params_grads):
         self._init_dgc_opt()
-        return self.dgc_opt.apply_optimize(
+        return self.dgc_opt._apply_optimize(
             loss, startup_program=startup_program, params_grads=params_grads
         )

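For context on where this class is exercised: DGCMomentumOptimizer is normally created by the fleet meta-optimizer machinery rather than instantiated directly. A hedged sketch of enabling it through a fleet DistributedStrategy (assumes a collective launch environment; not part of this commit):

import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)

strategy = fleet.DistributedStrategy()
strategy.dgc = True
strategy.dgc_configs = {"rampup_begin_step": 0}

opt = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
# fleet wraps the plain Momentum in the DGC meta-optimizer when the strategy allows it.
opt = fleet.distributed_optimizer(opt, strategy=strategy)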
2 changes: 0 additions & 2 deletions python/paddle/fluid/__init__.py
@@ -53,7 +53,6 @@
 from .initializer import set_global_initializer
 from . import layers
 from . import dygraph
-from . import optimizer
 from . import backward
 from .backward import gradients
 from . import incubate
@@ -109,7 +108,6 @@
     'disable_dygraph',
     'enable_imperative',
     'disable_imperative',
-    'optimizer',
     'backward',
     'LoDTensor',
     'LoDTensorArray',
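Since `optimizer` is no longer exported from the fluid namespace, code that still reaches optimizers through `paddle.fluid.optimizer` needs to move to `paddle.optimizer`. A hedged migration sketch (illustrative; keyword names follow the 2.x API):

import paddle

linear = paddle.nn.Linear(10, 1)

# Before (removed by this commit):
#   opt = paddle.fluid.optimizer.SGD(learning_rate=0.01, parameter_list=linear.parameters())
# After:
opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=linear.parameters())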
3 changes: 0 additions & 3 deletions python/paddle/fluid/dygraph/__init__.py
@@ -18,9 +18,6 @@
 from . import tracer
 from .tracer import *

-from . import learning_rate_scheduler
-from .learning_rate_scheduler import *

 __all__ = []
 __all__ += base.__all__
-__all__ += learning_rate_scheduler.__all__
180 changes: 0 additions & 180 deletions python/paddle/fluid/dygraph/learning_rate_scheduler.py

This file was deleted.
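The deleted module held the old imperative learning-rate schedulers (PiecewiseDecay, NoamDecay, and friends); their 2.x replacements live under `paddle.optimizer.lr` and are passed to the optimizer as `learning_rate`. A hedged sketch of the replacement pattern, illustrative only:

import paddle

linear = paddle.nn.Linear(10, 1)

# paddle.optimizer.lr schedulers replace the deleted dygraph schedulers.
scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.1, step_size=30, gamma=0.1)
opt = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())

# Typical loop: opt.step(); opt.clear_grad(); then scheduler.step() once per epoch.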
