From 898329fe33df1a92d3c01394af4c5685afff7891 Mon Sep 17 00:00:00 2001
From: mykolaskrynnyk <45297092+mykolaskrynnyk@users.noreply.github.com>
Date: Wed, 20 Mar 2024 19:34:25 +0100
Subject: [PATCH] Docs/modelling layers (#1502)

* Docs(layers): add a description for `tie_weights` argument

* Refactor(layers): make `name` an explicit argument for Transformer layers

* Refactor(layers): remove explicit usage of `name` in `__init__` calls

* Docs(layers): remove references to `name` and consistently documents `**kwargs`
---
 keras_nlp/layers/modeling/alibi_bias.py                   | 3 +++
 keras_nlp/layers/modeling/f_net_encoder.py                | 7 +++----
 keras_nlp/layers/modeling/masked_lm_head.py               | 2 ++
 keras_nlp/layers/modeling/position_embedding.py           | 2 ++
 keras_nlp/layers/modeling/reversible_embedding.py         | 2 ++
 keras_nlp/layers/modeling/rotary_embedding.py             | 2 ++
 keras_nlp/layers/modeling/sine_position_encoding.py       | 2 ++
 keras_nlp/layers/modeling/token_and_position_embedding.py | 5 +++++
 keras_nlp/layers/modeling/transformer_decoder.py          | 4 ++--
 keras_nlp/layers/modeling/transformer_encoder.py          | 4 ++--
 10 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/keras_nlp/layers/modeling/alibi_bias.py b/keras_nlp/layers/modeling/alibi_bias.py
index fdc956ae1..cc72be3f8 100644
--- a/keras_nlp/layers/modeling/alibi_bias.py
+++ b/keras_nlp/layers/modeling/alibi_bias.py
@@ -35,6 +35,9 @@ class AlibiBias(keras.layers.Layer):
             each head. The heads' slopes are a geometric sequence that starts
             at `2**(-alibi_bias_max/num_heads)` and uses that same value as
             its ratio. Defaults to 8.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
+
     Call arguments:
         attention_scores: The result of multipying the query and the key of the
             multi-head attention layer of the transformer to add alibi bias to
diff --git a/keras_nlp/layers/modeling/f_net_encoder.py b/keras_nlp/layers/modeling/f_net_encoder.py
index a5370d960..0732dee34 100644
--- a/keras_nlp/layers/modeling/f_net_encoder.py
+++ b/keras_nlp/layers/modeling/f_net_encoder.py
@@ -47,8 +47,8 @@ class FNetEncoder(keras.layers.Layer):
         bias_initializer: "string" or `keras.initializers` initializer.
             The bias initializer for the dense layers.
             Defaults to `"zeros"`.
-        name: string. The name of the layer. Defaults to `None`.
-        **kwargs: other keyword arguments.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
     Examples:
 
@@ -79,10 +79,9 @@ def __init__(
         layer_norm_epsilon=1e-5,
         kernel_initializer="glorot_uniform",
         bias_initializer="zeros",
-        name=None,
         **kwargs
     ):
-        super().__init__(name=name, **kwargs)
+        super().__init__(**kwargs)
         self.intermediate_dim = intermediate_dim
         self.dropout = dropout
         self.activation = keras.activations.get(activation)
diff --git a/keras_nlp/layers/modeling/masked_lm_head.py b/keras_nlp/layers/modeling/masked_lm_head.py
index eacee7e8c..d51f0eb50 100644
--- a/keras_nlp/layers/modeling/masked_lm_head.py
+++ b/keras_nlp/layers/modeling/masked_lm_head.py
@@ -59,6 +59,8 @@ class MaskedLMHead(keras.layers.Layer):
         bias_initializer: string or `keras.initializers` initializer.
             The bias initializer for the dense and multiheaded
             attention layers. Defaults to `"zeros"`.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
     Examples:
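For illustration, a minimal usage sketch of what the `f_net_encoder.py` refactor above means for callers (this sketch is not part of the patch, and assumes a keras-nlp build that includes this change; the `intermediate_dim` value is arbitrary):

```python
import keras_nlp

# After the refactor, `name` is no longer an explicit parameter of
# `FNetEncoder.__init__`; it is forwarded to the base `keras.layers.Layer`
# through `**kwargs`, alongside arguments such as `trainable` and `dtype`.
encoder = keras_nlp.layers.FNetEncoder(
    intermediate_dim=64,  # arbitrary size, for illustration only
    name="f_net_encoder",
)
```

The call site is unchanged; only the signature of `__init__` is slimmer, which is why the docstrings now document `**kwargs` instead of `name`.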
diff --git a/keras_nlp/layers/modeling/position_embedding.py b/keras_nlp/layers/modeling/position_embedding.py
index 6f9a44c29..34597cb11 100644
--- a/keras_nlp/layers/modeling/position_embedding.py
+++ b/keras_nlp/layers/modeling/position_embedding.py
@@ -33,6 +33,8 @@ class PositionEmbedding(keras.layers.Layer):
         initializer: The initializer to use for the embedding weights.
             Defaults to `"glorot_uniform"`.
         seq_axis: The axis of the input tensor where we add the embeddings.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
     Call arguments:
         inputs: The tensor inputs to compute an embedding for, with shape
diff --git a/keras_nlp/layers/modeling/reversible_embedding.py b/keras_nlp/layers/modeling/reversible_embedding.py
index d11521768..9266b6d28 100644
--- a/keras_nlp/layers/modeling/reversible_embedding.py
+++ b/keras_nlp/layers/modeling/reversible_embedding.py
@@ -52,6 +52,8 @@ class ReversibleEmbedding(keras.layers.Embedding):
         reverse_dtype: The dtype for the reverse projection computation.
             For stability, it is usually best to use full precision even when
             working with half or mixed precision training.
+        **kwargs: other keyword arguments passed to `keras.layers.Embedding`,
+            including `name`, `trainable`, `dtype` etc.
 
     Call arguments:
         inputs: The tensor inputs to the layer.
diff --git a/keras_nlp/layers/modeling/rotary_embedding.py b/keras_nlp/layers/modeling/rotary_embedding.py
index b494d559b..1442548ea 100644
--- a/keras_nlp/layers/modeling/rotary_embedding.py
+++ b/keras_nlp/layers/modeling/rotary_embedding.py
@@ -38,6 +38,8 @@ class RotaryEmbedding(keras.layers.Layer):
         scaling_factor: float. The scaling factor used to scale frequency range.
         sequence_axis: int. Sequence axis in the input tensor.
         feature_axis: int. Feature axis in the input tensor.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
     Call arguments:
         inputs: The tensor inputs to apply the embedding to. This can have
diff --git a/keras_nlp/layers/modeling/sine_position_encoding.py b/keras_nlp/layers/modeling/sine_position_encoding.py
index 6e96a77e2..5ab874c11 100644
--- a/keras_nlp/layers/modeling/sine_position_encoding.py
+++ b/keras_nlp/layers/modeling/sine_position_encoding.py
@@ -34,6 +34,8 @@ class SinePositionEncoding(keras.layers.Layer):
         max_wavelength: The maximum angular wavelength of the sine/cosine
             curves, as described in Attention is All You Need. Defaults to
             `10000`.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
     Call arguments:
         inputs: The tensor inputs to compute an embedding for, with shape
diff --git a/keras_nlp/layers/modeling/token_and_position_embedding.py b/keras_nlp/layers/modeling/token_and_position_embedding.py
index bb7107f96..6266963bf 100644
--- a/keras_nlp/layers/modeling/token_and_position_embedding.py
+++ b/keras_nlp/layers/modeling/token_and_position_embedding.py
@@ -33,6 +33,9 @@ class TokenAndPositionEmbedding(keras.layers.Layer):
         vocabulary_size: The size of the vocabulary.
         sequence_length: The maximum length of input sequence
         embedding_dim: The output dimension of the embedding layer
+        tie_weights: Boolean, whether or not the matrix for embedding and
+            the matrix for the `reverse` projection should share the same
+            weights.
         embeddings_initializer: The initializer to use for the Embedding
             Layers
         mask_zero: Boolean, whether or not the input value 0 is a special
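A minimal usage sketch of the `tie_weights` argument documented above (not part of the patch; assumes a keras-nlp build that includes this change, and the sizes are arbitrary):

```python
import keras_nlp

# `tie_weights=True` shares the token embedding matrix with the `reverse`
# output projection of the underlying `ReversibleEmbedding`;
# `tie_weights=False` gives that projection its own weight matrix.
embedding = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=1000,  # arbitrary sizes, for illustration only
    sequence_length=128,
    embedding_dim=64,
    tie_weights=True,
)
```

Weight tying halves the parameter count of the embedding/output pair, which is why it is worth calling out in the docstring.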
@@ -43,6 +46,8 @@ class TokenAndPositionEmbedding(keras.layers.Layer):
             If mask_zero` is set to True, as a consequence, index 0 cannot
             be used in the vocabulary (input_dim should equal size of
             vocabulary + 1).
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
     Examples:
     ```python
diff --git a/keras_nlp/layers/modeling/transformer_decoder.py b/keras_nlp/layers/modeling/transformer_decoder.py
index d06a1948f..0de35da0b 100644
--- a/keras_nlp/layers/modeling/transformer_decoder.py
+++ b/keras_nlp/layers/modeling/transformer_decoder.py
@@ -69,8 +69,8 @@ class TransformerDecoder(keras.layers.Layer):
             (similar to GPT-2). If set to False, outputs of attention layer and
             intermediate dense layer are normalized (similar to BERT).
             Defaults to `False`.
-        name: string. The name of the layer. Defaults to `None`.
-        **kwargs: other keyword arguments.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
     Examples:
     ```python
diff --git a/keras_nlp/layers/modeling/transformer_encoder.py b/keras_nlp/layers/modeling/transformer_encoder.py
index 32cdd3554..cd45b6aeb 100644
--- a/keras_nlp/layers/modeling/transformer_encoder.py
+++ b/keras_nlp/layers/modeling/transformer_encoder.py
@@ -58,8 +58,8 @@ class TransformerEncoder(keras.layers.Layer):
             (similar to GPT-2). If set to False, outputs of attention layer and
             intermediate dense layer are normalized (similar to BERT).
             Defaults to `False`.
-        name: string. The name of the layer. Defaults to `None`.
-        **kwargs: other keyword arguments.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
     Examples:
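A minimal usage sketch of the `**kwargs` pass-through documented for both Transformer layers (not part of the patch; assumes a keras-nlp build that includes this change, and the layer hyperparameters are arbitrary):

```python
import keras_nlp

# `name` and `dtype` are not declared parameters of
# `TransformerEncoder.__init__`; they pass through `**kwargs`
# to the base `keras.layers.Layer`.
encoder = keras_nlp.layers.TransformerEncoder(
    intermediate_dim=64,  # arbitrary sizes, for illustration only
    num_heads=8,
    name="transformer_encoder",
    dtype="float32",
)
```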