From 5a29d4f3285f896576cecb03ea2dc540e7ea013d Mon Sep 17 00:00:00 2001
From: "Wonseok Lee (Jack)"
Date: Thu, 11 Aug 2022 21:51:39 +0900
Subject: [PATCH] Fix LayoutLMv3 documentation (#17932)

* fix typos
* fix sequence_length docs of LayoutLMv3Model
* delete trailing white spaces
* fix layoutlmv3 docs more
* apply make fixup & quality
* change to two versions of input docstring
* apply make fixup & quality
---
 .../models/layoutlmv3/modeling_layoutlmv3.py | 112 ++++++++++++++++--
 1 file changed, 99 insertions(+), 13 deletions(-)

diff --git a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py
index f3bdd2cd8d9067..68987e38e9942e 100644
--- a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py
+++ b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py
@@ -54,17 +54,93 @@ behavior.
 
     Parameters:
-        config ([`LayoutLMv2Config`]): Model configuration class with all the parameters of the model.
+        config ([`LayoutLMv3Config`]): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the
             configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 """
 
-LAYOUTLMV3_INPUTS_DOCSTRING = r"""
+LAYOUTLMV3_MODEL_INPUTS_DOCSTRING = r"""
     Args:
-        input_ids (`torch.LongTensor` of shape `{0}`):
+        input_ids (`torch.LongTensor` of shape `({0})`):
             Indices of input sequence tokens in the vocabulary.
 
-            Indices can be obtained using [`LayoutLMv2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for the [CLS]
+            token. See `pixel_values` for `patch_sequence_length`.
+
+            Indices can be obtained using [`LayoutLMv3Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
 
             [What are input IDs?](../glossary#input-ids)
+
+        bbox (`torch.LongTensor` of shape `({0}, 4)`, *optional*):
+            Bounding boxes of each input sequence token. Selected in the range `[0,
+            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
+            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
+            y1) represents the position of the lower right corner.
+
+            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for the [CLS]
+            token. See `pixel_values` for `patch_sequence_length`.
+
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Batch of document images. Each image is divided into patches of shape `(num_channels, config.patch_size,
+            config.patch_size)` and the total number of patches (=`patch_sequence_length`) equals `((height /
+            config.patch_size) * (width / config.patch_size))`.
+
+        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for the [CLS]
+            token. See `pixel_values` for `patch_sequence_length`.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs.
+            Indices are selected in `[0, 1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for the [CLS]
+            token. See `pixel_values` for `patch_sequence_length`.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for the [CLS]
+            token. See `pixel_values` for `patch_sequence_length`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+LAYOUTLMV3_DOWNSTREAM_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`LayoutLMv3Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
 
             [What are input IDs?](../glossary#input-ids)
@@ -76,16 +152,18 @@ y1) represents the position of the lower right corner.
 
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Batch of document images.
+            Batch of document images. Each image is divided into patches of shape `(num_channels, config.patch_size,
+            config.patch_size)` and the total number of patches (=`patch_sequence_length`) equals `((height /
+            config.patch_size) * (width / config.patch_size))`.
 
-        attention_mask (`torch.FloatTensor` of shape `{0}`, *optional*):
+        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
 
             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.
 
             [What are attention masks?](../glossary#attention-mask)
-        token_type_ids (`torch.LongTensor` of shape `{0}`, *optional*):
+        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
             1]`:
@@ -93,7 +171,7 @@
             - 1 corresponds to a *sentence B* token.
 
             [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`torch.LongTensor` of shape `{0}`, *optional*):
+        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
             Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
             config.max_position_embeddings - 1]`.
@@ -104,7 +182,7 @@
 
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
 
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
             is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
             model's internal embedding lookup matrix.
@@ -763,7 +841,9 @@ def forward_image(self, pixel_values):
 
         return embeddings
 
-    @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_start_docstrings_to_model_forward(
+        LAYOUTLMV3_MODEL_INPUTS_DOCSTRING.format("batch_size, token_sequence_length")
+    )
     @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -975,7 +1055,9 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(
+        LAYOUTLMV3_DOWNSTREAM_INPUTS_DOCSTRING.format("batch_size, sequence_length")
+    )
     @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1084,7 +1166,9 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(
+        LAYOUTLMV3_DOWNSTREAM_INPUTS_DOCSTRING.format("batch_size, sequence_length")
+    )
     @replace_return_docstrings(output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1214,7 +1298,9 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(
+        LAYOUTLMV3_DOWNSTREAM_INPUTS_DOCSTRING.format("batch_size, sequence_length")
+    )
     @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
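
As a sanity check on the shape arithmetic the new docstrings describe, here is a minimal sketch (illustrative only, not part of the patch). It assumes the default `LayoutLMv3Config` values (`input_size=224`, `patch_size=16`); the 20-token text length is an arbitrary example.

# Illustrative sketch: verify the sequence_length bookkeeping documented in
# LAYOUTLMV3_MODEL_INPUTS_DOCSTRING. Assumes default config values; the
# 20-token text length is an arbitrary example.
from transformers import LayoutLMv3Config

config = LayoutLMv3Config()  # input_size=224, patch_size=16 by default

# Each document image of shape (num_channels, height, width) is split into
# (patch_size x patch_size) patches, giving patch_sequence_length patches.
height = width = config.input_size
patch_sequence_length = (height // config.patch_size) * (width // config.patch_size)

token_sequence_length = 20  # e.g. number of word-piece tokens from the tokenizer

# Per the docstring: sequence_length = token_sequence_length + patch_sequence_length + 1,
# where the extra 1 accounts for a [CLS] token.
sequence_length = token_sequence_length + patch_sequence_length + 1

print(patch_sequence_length)  # 196 = (224 / 16) * (224 / 16)
print(sequence_length)        # 217

This distinction is why the patch introduces two docstring variants: LayoutLMv3Model's hidden states span text tokens plus image patches, so its decorator formats the docstring with `batch_size, token_sequence_length`, while the downstream heads keep the plain `batch_size, sequence_length` placeholder.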