Docs/modelling layers (keras-team#1502)
* Docs(layers): add a description for `tie_weights` argument

* Refactor(layers): make `name` an explicit argument for Transformer layers

* Refactor(layers): remove explicit usage of `name` in `__init__` calls

* Docs(layers): remove references to `name` and consistently document `**kwargs`
mykolaskrynnyk authored and abuelnasr0 committed Apr 2, 2024
1 parent 2acb4c9 commit 898329f
Showing 10 changed files with 25 additions and 8 deletions.
3 changes: 3 additions & 0 deletions keras_nlp/layers/modeling/alibi_bias.py
Original file line number Diff line number Diff line change
@@ -35,6 +35,9 @@ class AlibiBias(keras.layers.Layer):
             each head. The heads' slopes are a geometric sequence that starts at
             `2**(-alibi_bias_max/num_heads)` and uses that same value as its
             ratio. Defaults to 8.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
     Call arguments:
         attention_scores: The result of multiplying the query and the key of the
             multi-head attention layer of the transformer to add alibi bias to
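The slope rule in the docstring above is easy to check numerically. A minimal sketch in plain Python (`alibi_slopes` is a hypothetical helper, not the keras-nlp implementation):

```python
def alibi_slopes(num_heads, alibi_bias_max=8):
    # Geometric sequence whose first term and common ratio are both
    # 2 ** (-alibi_bias_max / num_heads), as the docstring describes.
    start = 2.0 ** (-alibi_bias_max / num_heads)
    return [start ** (i + 1) for i in range(num_heads)]

slopes = alibi_slopes(num_heads=8)  # start = 2**-1: 0.5, 0.25, ..., 2**-8
```

With the default `alibi_bias_max=8` and eight heads, the slopes halve at each head, which is the head-specific bias strength ALiBi applies to attention scores.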
7 changes: 3 additions & 4 deletions keras_nlp/layers/modeling/f_net_encoder.py
@@ -47,8 +47,8 @@ class FNetEncoder(keras.layers.Layer):
         bias_initializer: "string" or `keras.initializers` initializer.
             The bias initializer for the dense layers.
             Defaults to `"zeros"`.
-        name: string. The name of the layer. Defaults to `None`.
-        **kwargs: other keyword arguments.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
     Examples:
@@ -79,10 +79,9 @@ def __init__(
         layer_norm_epsilon=1e-5,
         kernel_initializer="glorot_uniform",
         bias_initializer="zeros",
-        name=None,
         **kwargs
     ):
-        super().__init__(name=name, **kwargs)
+        super().__init__(**kwargs)
         self.intermediate_dim = intermediate_dim
         self.dropout = dropout
         self.activation = keras.activations.get(activation)
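The refactor above can be sketched in isolation. `BaseLayer` below is a hypothetical stand-in for `keras.layers.Layer`, just to keep the sketch self-contained: instead of declaring `name` as an explicit parameter, the subclass forwards all extra keyword arguments to the base class.

```python
class BaseLayer:
    """Stand-in for keras.layers.Layer (illustrative, not the real API)."""
    def __init__(self, name=None, trainable=True, dtype="float32"):
        self.name = name
        self.trainable = trainable
        self.dtype = dtype

class FNetEncoderSketch(BaseLayer):
    def __init__(self, intermediate_dim, dropout=0.0, **kwargs):
        # `name` is no longer an explicit parameter; it travels in **kwargs
        # along with `trainable`, `dtype`, etc.
        super().__init__(**kwargs)
        self.intermediate_dim = intermediate_dim
        self.dropout = dropout

encoder = FNetEncoderSketch(intermediate_dim=64, name="f_net_encoder")
```

The upside is that every base-class constructor argument is accepted automatically, so the docstring only needs to document `**kwargs` once rather than re-document `name`.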
2 changes: 2 additions & 0 deletions keras_nlp/layers/modeling/masked_lm_head.py
@@ -59,6 +59,8 @@ class MaskedLMHead(keras.layers.Layer):
         bias_initializer: string or `keras.initializers` initializer.
             The bias initializer for the dense and multiheaded
             attention layers. Defaults to `"zeros"`.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
Examples:
2 changes: 2 additions & 0 deletions keras_nlp/layers/modeling/position_embedding.py
@@ -33,6 +33,8 @@ class PositionEmbedding(keras.layers.Layer):
         initializer: The initializer to use for the embedding weights. Defaults
             to `"glorot_uniform"`.
         seq_axis: The axis of the input tensor where we add the embeddings.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
Call arguments:
inputs: The tensor inputs to compute an embedding for, with shape
2 changes: 2 additions & 0 deletions keras_nlp/layers/modeling/reversible_embedding.py
@@ -52,6 +52,8 @@ class ReversibleEmbedding(keras.layers.Embedding):
         reverse_dtype: The dtype for the reverse projection computation.
             For stability, it is usually best to use full precision even when
             working with half or mixed precision training.
+        **kwargs: other keyword arguments passed to `keras.layers.Embedding`,
+            including `name`, `trainable`, `dtype` etc.
Call arguments:
inputs: The tensor inputs to the layer.
2 changes: 2 additions & 0 deletions keras_nlp/layers/modeling/rotary_embedding.py
@@ -38,6 +38,8 @@ class RotaryEmbedding(keras.layers.Layer):
         scaling_factor: float. The scaling factor used to scale frequency range.
         sequence_axis: int. Sequence axis in the input tensor.
         feature_axis: int. Feature axis in the input tensor.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
Call arguments:
inputs: The tensor inputs to apply the embedding to. This can have
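For intuition on what this layer computes: rotary embedding (RoPE) rotates consecutive feature pairs by position-dependent angles with geometrically spaced frequencies. A minimal single-vector sketch in plain Python, assuming the default axes, no scaling factor, and a hypothetical `rotary_embed` helper (not the keras-nlp implementation):

```python
import math

def rotary_embed(x, position, max_wavelength=10000):
    # Rotate each consecutive feature pair (x[i], x[i+1]) by an angle of
    # position * max_wavelength ** (-i / dim), i.e. geometrically
    # decreasing frequencies across feature pairs.
    dim = len(x)
    out = list(x)
    for i in range(0, dim, 2):
        freq = max_wavelength ** (-i / dim)
        angle = position * freq
        c, s = math.cos(angle), math.sin(angle)
        out[i] = x[i] * c - x[i + 1] * s
        out[i + 1] = x[i] * s + x[i + 1] * c
    return out

rotated = rotary_embed([1.0, 0.0, 1.0, 0.0], position=2)
```

At position 0 every angle is zero, so the input passes through unchanged; relative position information emerges because the rotation angle grows linearly with position.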
2 changes: 2 additions & 0 deletions keras_nlp/layers/modeling/sine_position_encoding.py
@@ -34,6 +34,8 @@ class SinePositionEncoding(keras.layers.Layer):
         max_wavelength: The maximum angular wavelength of the sine/cosine
             curves, as described in Attention is All You Need. Defaults to
             `10000`.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
Call arguments:
inputs: The tensor inputs to compute an embedding for, with shape
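The `max_wavelength` argument sets the longest sine/cosine period in the encoding from "Attention Is All You Need". A minimal sketch of that scheme in plain Python (hypothetical helper, not the keras-nlp implementation): even feature indices get sine, odd get cosine, with wavelengths spaced geometrically up to `max_wavelength`.

```python
import math

def sine_position_encoding(seq_len, dim, max_wavelength=10000):
    # encoding[pos][i] = sin(pos * freq) for even i, cos(pos * freq) for
    # odd i, where freq = max_wavelength ** (-(2 * (i // 2)) / dim).
    encoding = []
    for pos in range(seq_len):
        row = []
        for i in range(dim):
            freq = max_wavelength ** (-(2 * (i // 2)) / dim)
            angle = pos * freq
            row.append(math.sin(angle) if i % 2 == 0 else math.cos(angle))
        encoding.append(row)
    return encoding

enc = sine_position_encoding(seq_len=4, dim=8)
```

Position 0 always encodes as `[0, 1, 0, 1, ...]`, and each later position is a unique, smoothly varying pattern of phases.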
5 changes: 5 additions & 0 deletions keras_nlp/layers/modeling/token_and_position_embedding.py
@@ -33,6 +33,9 @@ class TokenAndPositionEmbedding(keras.layers.Layer):
         vocabulary_size: The size of the vocabulary.
         sequence_length: The maximum length of input sequence
         embedding_dim: The output dimension of the embedding layer
+        tie_weights: Boolean, whether or not the matrix for embedding and
+            the matrix for the `reverse` projection should share the same
+            weights.
         embeddings_initializer: The initializer to use for the Embedding
             Layers
         mask_zero: Boolean, whether or not the input value 0 is a special
@@ -43,6 +46,8 @@ class TokenAndPositionEmbedding(keras.layers.Layer):
             If `mask_zero` is set to True, as a consequence, index 0 cannot be
             used in the vocabulary
             (input_dim should equal size of vocabulary + 1).
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
Examples:
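The new `tie_weights` documentation describes weight tying: one matrix maps token ids to vectors, and its transpose maps vectors back to vocabulary logits. A toy sketch of that idea in plain Python (hypothetical class with deterministic toy weights, not the keras-nlp implementation):

```python
class TiedEmbeddingSketch:
    def __init__(self, vocab_size, dim):
        # Toy deterministic "weights" purely for illustration.
        self.weights = [[(v * dim + d) * 0.01 for d in range(dim)]
                        for v in range(vocab_size)]

    def embed(self, token_id):
        # Forward direction: token id -> embedding vector (one row of W).
        return self.weights[token_id]

    def reverse(self, vector):
        # Reverse projection: vector -> per-token logits via a dot product
        # with every embedding row (W @ v). No second matrix is allocated;
        # the same weights serve both directions.
        return [sum(w * x for w, x in zip(row, vector))
                for row in self.weights]

emb = TiedEmbeddingSketch(vocab_size=3, dim=2)
logits = emb.reverse(emb.embed(1))
```

Tying halves the parameter count of the embedding/output pair, which is why `ReversibleEmbedding` exposes the same switch.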
4 changes: 2 additions & 2 deletions keras_nlp/layers/modeling/transformer_decoder.py
@@ -69,8 +69,8 @@ class TransformerDecoder(keras.layers.Layer):
             (similar to GPT-2). If set to False, outputs of attention layer and
             intermediate dense layer are normalized (similar to BERT).
             Defaults to `False`.
-        name: string. The name of the layer. Defaults to `None`.
-        **kwargs: other keyword arguments.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
Examples:
4 changes: 2 additions & 2 deletions keras_nlp/layers/modeling/transformer_encoder.py
@@ -58,8 +58,8 @@ class TransformerEncoder(keras.layers.Layer):
             (similar to GPT-2). If set to False, outputs of attention layer and
             intermediate dense layer are normalized (similar to BERT).
             Defaults to `False`.
-        name: string. The name of the layer. Defaults to `None`.
-        **kwargs: other keyword arguments.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
Examples:
