enable users to set initial memory states for lstm/gru group #2641

Merged: 2 commits, Jun 29, 2017
51 changes: 26 additions & 25 deletions python/paddle/trainer_config_helpers/layers.py
@@ -1149,10 +1149,10 @@ def pooling_layer(input,
@layer_support(DROPOUT)
def lstmemory(input,
name=None,
size=None,
reverse=False,
act=None,
gate_act=None,
size=None,
state_act=None,
bias_attr=None,
param_attr=None,
@@ -1194,6 +1194,8 @@ def lstmemory(input,

:param name: The lstmemory layer name.
:type name: basestring
:param size: DEPRECATED. The size of the lstm cell.
:type size: int
:param input: input layer name.
:type input: LayerOutput
:param reverse: is sequence process reversed or not.
@@ -1220,15 +1222,15 @@
assert state_act.support_hppl
assert act.support_hppl
assert input.size is not None and input.size % 4 == 0

if size is not None:
if input.size / 4 == size:
plog = logger.warning
else:
plog = logger.fatal

plog("NOTE: The lstmemory layer[%s]'s size is set by previous input "
"layer. The lstm size should be equal with input layer size/4. The"
" size which is set explicitly will be ignored." % name)
plog("size of lstmemory layer: %s is automatically set to "
"size of input layer / 4. The parameter size passing to "
"this layer is ignored." % (name))

Layer(
name=name,
@@ -1255,11 +1257,11 @@
@wrap_name_default("gru")
@layer_support(DROPOUT)
def grumemory(input,
size=None,
name=None,
reverse=False,
act=None,
gate_act=None,
size=None,
bias_attr=None,
param_attr=None,
layer_attr=None):
@@ -1318,6 +1320,8 @@ def grumemory(input,
:type name: None|basestring
:param input: input layer.
:type input: LayerOutput.
:param size: DEPRECATED. The size of the gru cell.
:type size: int
:param reverse: Whether sequence process is reversed or not.
:type reverse: bool
:param act: activation type, TanhActivation by default. This activation
@@ -1334,9 +1338,6 @@
:type param_attr: ParameterAttribute|None|False
:param layer_attr: Extra Layer attribute
:type layer_attr: ExtraLayerAttribute|None
:param size: Stub parameter of size, but actually not used. If set this size
will get a warning.
:type size: None
:return: LayerOutput object.
:rtype: LayerOutput
"""
@@ -1348,9 +1349,9 @@ def grumemory(input,
plog = logger.warning
else:
plog = logger.fatal
plog("NOTE: the gru memory layer's size is set by previous input layer,"
" and should be input size / 3. Set size explicitly will be "
"ignored.")
plog("size of grumemory layer: %s is automatically set to "
"size of input layer / 3. The parameter size passing to this "
"layer is ignored." % (name))

Layer(
name=name,
@@ -2524,8 +2525,8 @@ def img_cmrnorm_layer(input,


@wrap_bias_attr_default()
@wrap_param_attr_default(default_factory=lambda _: ParamAttr(initial_mean=1.0,
initial_std=0.))
@wrap_param_attr_default(
default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.))
@wrap_act_default(act=ReluActivation())
@wrap_name_default("batch_norm")
@layer_support(DROPOUT)
@@ -3013,25 +3014,25 @@ def lstm_step_layer(input,
bias_attr=None,
layer_attr=None):
"""
LSTM Step Layer. It used in recurrent_group. The lstm equations are shown
as follow.
LSTM Step Layer. This function is used only in recurrent_group.
The lstm equations are shown as follows.

.. math::

i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)

f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)

c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)

o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)

h_t & = o_t tanh(c_t)


The input of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use
:code:`mixed_layer` and :code:`full_matrix_projection` to calculate these
input vector.
input vectors.

The state of lstm step is :math:`c_{t-1}`. And lstm step layer will do

@@ -3042,14 +3043,14 @@ def lstm_step_layer(input,
...


This layer contains two outputs. Default output is :math:`h_t`. The other
output is :math:`o_t`, which name is 'state' and can use
This layer has two outputs. The default output is :math:`h_t`. The other
output is :math:`o_t`, whose name is 'state'; one can use
:code:`get_output_layer` to extract this output.

:param name: Layer's name.
:type name: basestring
:param size: Layer's size. NOTE: lstm layer's size, should be equal as
:code:`input.size/4`, and should be equal as
:param size: Layer's size. NOTE: the lstm layer's size should be equal to
:code:`input.size/4`, and should be equal to
:code:`state.size`.
:type size: int
:param input: input layer. :math:`Wx_t + Wh_{t-1}`
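For reference, here is a minimal sketch of how the docstring above intends lstm_step_layer to be wired inside a recurrent_group. It is not part of this diff; the layer names ('lstm_out', 'lstm_out_state'), the data layer, and the sizes are illustrative assumptions against the v1 trainer_config_helpers API.

from paddle.trainer_config_helpers import *

SIZE = 256

def lstm_step(ipt):
    # Memories bind to the previous step's outputs by layer name.
    out_mem = memory(name='lstm_out', size=SIZE)          # h_{t-1}
    state_mem = memory(name='lstm_out_state', size=SIZE)  # c_{t-1}
    # Compute W*x_t + W*h_{t-1} with mixed_layer and
    # full_matrix_projection, as the docstring requires.
    with mixed_layer(size=SIZE * 4) as step_input:
        step_input += full_matrix_projection(input=ipt)
        step_input += full_matrix_projection(input=out_mem)
    out = lstm_step_layer(
        name='lstm_out', input=step_input, state=state_mem, size=SIZE)
    # Expose the second output (named 'state') so state_mem can bind to it.
    get_output_layer(name='lstm_out_state', input=out, arg_name='state')
    return out

seq = data_layer(name='seq_in', size=128)
rnn_out = recurrent_group(step=lstm_step, input=seq)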
50 changes: 32 additions & 18 deletions python/paddle/trainer_config_helpers/networks.py
@@ -614,6 +614,7 @@ def simple_lstm(input,

@wrap_name_default('lstm_unit')
def lstmemory_unit(input,
memory_boot=None,
name=None,
size=None,
param_attr=None,
@@ -626,9 +627,9 @@ def lstmemory_unit(input,
lstm_layer_attr=None,
get_output_layer_attr=None):
"""
Define calculations that a LSTM unit performs in a single time step.
This function itself is not a recurrent layer, so that it can not be
directly applied to sequence input. This function is always used in
Define calculations that an LSTM unit performs during a single time step.
This function itself is not a recurrent layer, so it cannot be
directly used to process sequence inputs. This function is always used in
recurrent_group (see layers.py for more details) to implement attention
mechanism.

@@ -638,13 +639,13 @@ def lstmemory_unit(input,

.. math::

i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)

f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)

c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)

o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)

h_t & = o_t tanh(c_t)

@@ -661,6 +662,8 @@ def lstmemory_unit(input,

:param input: input layer name.
:type input: LayerOutput
:param memory_boot: the initialization state of the LSTM cell.
:type memory_boot: LayerOutput | None
:param name: lstmemory unit name.
:type name: basestring
:param size: lstmemory unit size.
@@ -692,7 +695,8 @@ def lstmemory_unit(input,
assert input.size % 4 == 0
size = input.size / 4
out_mem = memory(name=name, size=size)
state_mem = memory(name="%s_state" % name, size=size)
state_mem = memory(
name="%s_state" % name, size=size, boot_layer=memory_boot)

with mixed_layer(
name="%s_input_recurrent" % name,
@@ -726,6 +730,7 @@
def lstmemory_group(input,
size=None,
name=None,
memory_boot=None,
reverse=False,
param_attr=None,
act=None,
@@ -737,7 +742,7 @@
lstm_layer_attr=None,
get_output_layer_attr=None):
"""
lstm_group is a recurrent layer group version of Long Short Term Memory. It
lstm_group is a recurrent_group version of Long Short Term Memory. It
does exactly the same calculation as the lstmemory layer (see lstmemory in
layers.py for the maths) does. A promising benefit is that LSTM memory
cell states, or hidden states in every time step are accessible to the
@@ -748,8 +753,8 @@ def lstmemory_group(input,

NOTE: In PaddlePaddle's implementation, the following input-to-hidden
multiplications:
:math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`,
:math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in lstmemory_unit to
:math:`W_{x_i}x_{t}` , :math:`W_{x_f}x_{t}`,
:math:`W_{x_c}x_t`, :math:`W_{x_o}x_{t}` are not done in lstmemory_unit to
speed up the calculations. Consequently, an additional mixed_layer with
full_matrix_projection must be included before lstmemory_unit is called.

@@ -765,10 +770,12 @@

:param input: input layer name.
:type input: LayerOutput
:param name: lstmemory group name.
Member: This line should be retained?
Contributor Author: done.

:type name: basestring
:param size: lstmemory group size.
:type size: int
:param name: name of the lstmemory group.
:type name: basestring
:param memory_boot: the initialization state of the LSTM cell.
:type memory_boot: LayerOutput | None
:param reverse: is lstm reversed
:type reverse: bool
:param param_attr: Parameter config, None if use default.
@@ -798,6 +805,7 @@
def __lstm_step__(ipt):
return lstmemory_unit(
input=ipt,
memory_boot=memory_boot,
name=name,
size=size,
mixed_bias_attr=mixed_bias_attr,
@@ -819,6 +827,7 @@

@wrap_name_default('gru_unit')
def gru_unit(input,
memory_boot=None,
size=None,
name=None,
gru_bias_attr=None,
@@ -829,15 +838,17 @@ def gru_unit(input,
naive=False):
"""
Define calculations that a gated recurrent unit performs in a single time
step. This function itself is not a recurrent layer, so that it can not be
directly applied to sequence input. This function is almost always used in
step. This function itself is not a recurrent layer, so it cannot be
directly used to process sequence inputs. This function is always used in
the recurrent_group (see layers.py for more details) to implement attention
mechanism.

Please see grumemory in layers.py for the details about the maths.

:param input: input layer name.
:type input: LayerOutput
:param memory_boot: the initialization state of the GRU cell.
:type memory_boot: LayerOutput | None
:param name: name of the gru group.
:type name: basestring
:param size: hidden size of the gru.
@@ -856,7 +867,7 @@ def gru_unit(input,
if size is None:
size = input.size / 3

out_mem = memory(name=name, size=size)
out_mem = memory(name=name, size=size, boot_layer=memory_boot)

if naive:
__step__ = gru_step_naive_layer
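
The change above simply forwards memory_boot to the boot_layer of the unit's output memory, so the GRU hidden state at the first time step starts from that layer instead of zeros. A hedged sketch of calling it (all layer names and sizes below are illustrative, not from this PR):

enc = data_layer(name='enc_repr', size=64)       # stands in for an encoder output
enc_last = last_seq(input=enc)                   # one size-64 vector per sequence
dec_in = data_layer(name='dec_in', size=64 * 3)  # gru_unit expects 3 * size

def gru_step(ipt):
    return gru_unit(input=ipt, size=64, name='gru_decoder',
                    memory_boot=enc_last)        # new argument from this PR

out = recurrent_group(step=gru_step, input=dec_in)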
@@ -878,6 +889,7 @@

@wrap_name_default('gru_group')
def gru_group(input,
memory_boot=None,
size=None,
name=None,
reverse=False,
Expand All @@ -888,7 +900,7 @@ def gru_group(input,
gru_layer_attr=None,
naive=False):
"""
gru_group is a recurrent layer group version of Gated Recurrent Unit. It
gru_group is a recurrent_group version of Gated Recurrent Unit. It
does exactly the same calculation as the grumemory layer does. A promising
benefit is that gru hidden states are accessible to the user. This is
especially useful in attention model. If you do not need to access
@@ -908,6 +920,8 @@ def gru_group(input,

:param input: input layer name.
:type input: LayerOutput
:param memory_boot: the initialization state of the GRU cell.
:type memory_boot: LayerOutput | None
:param name: name of the gru group.
:type name: basestring
:param size: hidden size of the gru.
@@ -929,6 +943,7 @@ def gru_group(input,
def __gru_step__(ipt):
return gru_unit(
input=ipt,
memory_boot=memory_boot,
name=name,
size=size,
gru_bias_attr=gru_bias_attr,
@@ -1083,7 +1098,6 @@ def simple_gru2(input,

return grumemory(
name=name,
size=size,
input=m,
reverse=reverse,
bias_attr=gru_bias_attr,
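
Taken together, these changes let a configuration initialize an LSTM group's cell state from another layer, for example booting a decoder from an encoder's last state. A minimal sketch, assuming the v1 trainer_config_helpers API (encoder/decoder names and sizes are illustrative, not taken from this diff):

from paddle.trainer_config_helpers import *

HID = 128

src = data_layer(name='source', size=256)
encoder = simple_lstm(input=src, size=HID)  # per-step hidden states
enc_last = last_seq(input=encoder)          # final state, size HID

trg = data_layer(name='target', size=256)
# lstmemory_group expects a projected input of size 4 * HID (the W*x_t term).
proj = mixed_layer(size=HID * 4,
                   input=[full_matrix_projection(input=trg)])
decoder = lstmemory_group(
    input=proj,
    size=HID,
    memory_boot=enc_last)  # boots the cell state c_0; new in this PR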