From 9fb42f3bb9d7d9ec92624f6829c9eb5129ea67a0 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 18 Sep 2022 19:09:12 +0200 Subject: [PATCH 1/9] Add first draft --- docs/source/en/model_doc/layoutlmv2.mdx | 4 + src/transformers/__init__.py | 2 + .../models/layoutlmv2/__init__.py | 2 + .../models/layoutlmv2/modeling_layoutlmv2.py | 247 ++++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 7 + .../layoutlmv2/test_modeling_layoutlmv2.py | 2 + 6 files changed, 264 insertions(+) diff --git a/docs/source/en/model_doc/layoutlmv2.mdx b/docs/source/en/model_doc/layoutlmv2.mdx index e40a3cfc8d8a6b..b0ea6139130c82 100644 --- a/docs/source/en/model_doc/layoutlmv2.mdx +++ b/docs/source/en/model_doc/layoutlmv2.mdx @@ -307,3 +307,7 @@ print(encoding.keys()) ## LayoutLMv2ForQuestionAnswering [[autodoc]] LayoutLMv2ForQuestionAnswering + +## LayoutLMv2ForRelationExtraction + +[[autodoc]] LayoutLMv2ForRelationExtraction \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3c3a3a50064162..49dd9296b57860 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1379,6 +1379,7 @@ "LayoutLMv2ForTokenClassification", "LayoutLMv2Model", "LayoutLMv2PreTrainedModel", + "LayoutLMv2ForRelationExtraction", ] ) _import_structure["models.layoutlmv3"].extend( @@ -4080,6 +4081,7 @@ from .models.layoutlmv2 import ( LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST, LayoutLMv2ForQuestionAnswering, + LayoutLMv2ForRelationExtraction, LayoutLMv2ForSequenceClassification, LayoutLMv2ForTokenClassification, LayoutLMv2Model, diff --git a/src/transformers/models/layoutlmv2/__init__.py b/src/transformers/models/layoutlmv2/__init__.py index beaacb815843d0..802b1875920a2f 100644 --- a/src/transformers/models/layoutlmv2/__init__.py +++ b/src/transformers/models/layoutlmv2/__init__.py @@ -63,6 +63,7 @@ "LayoutLMv2Layer", "LayoutLMv2Model", "LayoutLMv2PreTrainedModel", + "LayoutLMv2ForRelationExtraction", ] if TYPE_CHECKING: @@ -95,6 +96,7 @@ from .modeling_layoutlmv2 import ( LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST, LayoutLMv2ForQuestionAnswering, + LayoutLMv2ForRelationExtraction, LayoutLMv2ForSequenceClassification, LayoutLMv2ForTokenClassification, LayoutLMv2Layer, diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py index be31af99d6dfd8..bcfbb3f98ce4f9 100755 --- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py @@ -15,6 +15,8 @@ """ PyTorch LayoutLMv2 model.""" import math +from copy import copy +from dataclasses import dataclass from typing import Optional, Tuple, Union import torch @@ -33,6 +35,7 @@ from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, torch_int_div from ...utils import ( + ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, is_detectron2_available, @@ -61,6 +64,32 @@ ] +@dataclass +class RelationExtractionOutput(ModelOutput): + """ + Class for outputs of [`LayoutLMv2ForRelationExtraction`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of + the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + class LayoutLMv2Embeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings.""" @@ -1424,3 +1453,221 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +class BiaffineAttention(torch.nn.Module): + """Implements a biaffine attention operator for binary relation classification. + Args: + PyTorch implementation of the biaffine attention operator from "End-to-end neural relation extraction using deep + biaffine attention" (https://arxiv.org/abs/1812.11275) which can be used as a classifier for binary relation + classification. + in_features (int): The size of the feature dimension of the inputs. out_features (int): The size of the feature + dimension of the output. + Shape: + - x_1: `(N, *, in_features)` where `N` is the batch dimension and `*` means any number of additional + dimensisons. + - x_2: `(N, *, in_features)`, where `N` is the batch dimension and `*` means any number of additional + dimensions. + - Output: `(N, *, out_features)`, where `N` is the batch dimension and `*` means any number + of additional dimensions. 
+ Examples: + >>> batch_size, in_features, out_features = 32, 100, 4 >>> biaffine_attention = BiaffineAttention(in_features, + out_features) >>> x_1 = torch.randn(batch_size, in_features) >>> x_2 = torch.randn(batch_size, in_features) >>> + output = biaffine_attention(x_1, x_2) >>> print(output.size()) torch.Size([32, 4]) + """ + + def __init__(self, in_features, out_features): + super(BiaffineAttention, self).__init__() + + self.in_features = in_features + self.out_features = out_features + + self.bilinear = torch.nn.Bilinear(in_features, in_features, out_features, bias=False) + self.linear = torch.nn.Linear(2 * in_features, out_features, bias=True) + + self.reset_parameters() + + def forward(self, x_1, x_2): + return self.bilinear(x_1, x_2) + self.linear(torch.cat((x_1, x_2), dim=-1)) + + def reset_parameters(self): + self.bilinear.reset_parameters() + self.linear.reset_parameters() + + +class LayoutLMv2RelationExtractionDecoder(nn.Module): + def __init__(self, config): + super().__init__() + self.entity_emb = nn.Embedding(3, config.hidden_size, scale_grad_by_freq=True) + projection = nn.Sequential( + nn.Linear(config.hidden_size * 2, config.hidden_size), + nn.ReLU(), + nn.Dropout(config.hidden_dropout_prob), + nn.Linear(config.hidden_size, config.hidden_size // 2), + nn.ReLU(), + nn.Dropout(config.hidden_dropout_prob), + ) + self.ffnn_head = copy.deepcopy(projection) + self.ffnn_tail = copy.deepcopy(projection) + self.rel_classifier = BiaffineAttention(config.hidden_size // 2, 2) + self.loss_fct = CrossEntropyLoss() + + def build_relation(self, relations, entities): + batch_size = len(relations) + new_relations = [] + for b in range(batch_size): + if len(entities[b]["start"]) <= 2: + entities[b] = {"end": [1, 1], "label": [0, 0], "start": [0, 0]} + all_possible_relations = set( + [ + (i, j) + for i in range(len(entities[b]["label"])) + for j in range(len(entities[b]["label"])) + if entities[b]["label"][i] == 1 and entities[b]["label"][j] == 2 + ] + ) + if len(all_possible_relations) == 0: + all_possible_relations = set([(0, 1)]) + positive_relations = set(list(zip(relations[b]["head"], relations[b]["tail"]))) + negative_relations = all_possible_relations - positive_relations + positive_relations = set([i for i in positive_relations if i in all_possible_relations]) + reordered_relations = list(positive_relations) + list(negative_relations) + relation_per_doc = {"head": [], "tail": [], "label": []} + relation_per_doc["head"] = [i[0] for i in reordered_relations] + relation_per_doc["tail"] = [i[1] for i in reordered_relations] + relation_per_doc["label"] = [1] * len(positive_relations) + [0] * ( + len(reordered_relations) - len(positive_relations) + ) + assert len(relation_per_doc["head"]) != 0 + new_relations.append(relation_per_doc) + return new_relations, entities + + def get_predicted_relations(self, logits, relations, entities): + pred_relations = [] + for i, pred_label in enumerate(logits.argmax(-1)): + if pred_label != 1: + continue + rel = {} + rel["head_id"] = relations["head"][i] + rel["head"] = (entities["start"][rel["head_id"]], entities["end"][rel["head_id"]]) + rel["head_type"] = entities["label"][rel["head_id"]] + + rel["tail_id"] = relations["tail"][i] + rel["tail"] = (entities["start"][rel["tail_id"]], entities["end"][rel["tail_id"]]) + rel["tail_type"] = entities["label"][rel["tail_id"]] + rel["type"] = 1 + pred_relations.append(rel) + return pred_relations + + def forward(self, hidden_states, entities, relations): + batch_size, max_n_words, context_dim = 
hidden_states.size() + device = hidden_states.device + relations, entities = self.build_relation(relations, entities) + loss = 0 + all_pred_relations = [] + for b in range(batch_size): + head_entities = torch.tensor(relations[b]["head"], device=device) + tail_entities = torch.tensor(relations[b]["tail"], device=device) + relation_labels = torch.tensor(relations[b]["label"], device=device) + entities_start_index = torch.tensor(entities[b]["start"], device=device) + entities_labels = torch.tensor(entities[b]["label"], device=device) + head_index = entities_start_index[head_entities] + head_label = entities_labels[head_entities] + head_label_repr = self.entity_emb(head_label) + + tail_index = entities_start_index[tail_entities] + tail_label = entities_labels[tail_entities] + tail_label_repr = self.entity_emb(tail_label) + + head_repr = torch.cat( + (hidden_states[b][head_index], head_label_repr), + dim=-1, + ) + tail_repr = torch.cat( + (hidden_states[b][tail_index], tail_label_repr), + dim=-1, + ) + heads = self.ffnn_head(head_repr) + tails = self.ffnn_tail(tail_repr) + logits = self.rel_classifier(heads, tails) + loss += self.loss_fct(logits, relation_labels) + pred_relations = self.get_predicted_relations(logits, relations[b], entities[b]) + all_pred_relations.append(pred_relations) + return loss, all_pred_relations + + +@add_start_docstrings( + """ + LayoutLMv2 Model with a relation extraction head on top for key-value extraction tasks such as + [XFUND](https://github.com/doc-analysis/XFUND) (a bi-affine attention layer on top). + """, + LAYOUTLMV2_START_DOCSTRING, +) +class LayoutLMv2ForRelationExtraction(LayoutLMv2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.layoutlmv2 = LayoutLMv2Model(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.extractor = LayoutLMv2RelationExtractionDecoder(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(LAYOUTLMV2_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=RelationExtractionOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + bbox: Optional[torch.LongTensor] = None, + image: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + entities=None, + relations=None, + ): + r""" + entities (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + ... + relations (...): + ... 
+ + Returns: + + Example: + + ```python + >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForRelationExtraction + >>> from PIL import Image + >>> from datasets import load_dataset + ``` + """ + outputs = self.layoutlmv2( + input_ids=input_ids, + bbox=bbox, + image=image, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + ) + + seq_length = input_ids.size(1) + sequence_output = outputs[0][:, :seq_length] + sequence_output = self.dropout(sequence_output) + loss, pred_relations = self.extractor(sequence_output, entities, relations) + + return RelationExtractionOutput( + loss=loss, + entities=entities, + relations=relations, + pred_relations=pred_relations, + hidden_states=outputs[0], + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index b656cee9c89bdc..dccceb462ff8b7 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2628,6 +2628,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class LayoutLMv2ForRelationExtraction(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class LayoutLMv2ForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index 3c38373163e496..178feacda68340 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -34,6 +34,7 @@ MODEL_MAPPING, LayoutLMv2Config, LayoutLMv2ForQuestionAnswering, + LayoutLMv2ForRelationExtraction, LayoutLMv2ForSequenceClassification, LayoutLMv2ForTokenClassification, LayoutLMv2Model, @@ -269,6 +270,7 @@ class LayoutLMv2ModelTest(ModelTesterMixin, unittest.TestCase): LayoutLMv2ForSequenceClassification, LayoutLMv2ForTokenClassification, LayoutLMv2ForQuestionAnswering, + LayoutLMv2ForRelationExtraction, ) if is_torch_available() else () From bfa9fbeb3ce18bd6dacaca11230c8cfd35131a06 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 18 Sep 2022 19:33:33 +0200 Subject: [PATCH 2/9] Fix bug --- src/transformers/models/layoutlmv2/modeling_layoutlmv2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py index bcfbb3f98ce4f9..276835b6fd56da 100755 --- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py @@ -14,8 +14,8 @@ # limitations under the License. 
""" PyTorch LayoutLMv2 model.""" +import copy import math -from copy import copy from dataclasses import dataclass from typing import Optional, Tuple, Union From 36c41a1017eb6ed321e480d944095ae01a68e1a3 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 18 Sep 2022 21:36:56 +0200 Subject: [PATCH 3/9] Fix output --- .../models/layoutlmv2/modeling_layoutlmv2.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py index 276835b6fd56da..1034813d36dff8 100755 --- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py @@ -72,8 +72,12 @@ class RelationExtractionOutput(ModelOutput): Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): Classification (or regression if config.num_labels==1) loss. - logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): - Classification (or regression if config.num_labels==1) scores (before SoftMax). + entities (...) + ... + relations (...) + ... + pred_relations (...) + ... hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of @@ -85,7 +89,9 @@ class RelationExtractionOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + entities: dict = None + relations: dict = None + pred_relations: dict = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -1669,5 +1675,6 @@ def forward( entities=entities, relations=relations, pred_relations=pred_relations, - hidden_states=outputs[0], + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, ) From 0c05911e4c0b65903b498012edc7f6d00182fabe Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 19 Sep 2022 13:35:42 +0200 Subject: [PATCH 4/9] Add return_dict option --- .../models/layoutlmv2/modeling_layoutlmv2.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py index 1034813d36dff8..b0e6dc5fa046bb 100755 --- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py @@ -1632,12 +1632,11 @@ def forward( position_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, - start_positions: Optional[torch.LongTensor] = None, - end_positions: Optional[torch.LongTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, entities=None, relations=None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, ): r""" entities (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -1655,6 +1654,9 @@ def forward( >>> from datasets import load_dataset ``` """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.layoutlmv2( input_ids=input_ids, 
bbox=bbox, @@ -1663,12 +1665,20 @@ def forward( token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, ) seq_length = input_ids.size(1) - sequence_output = outputs[0][:, :seq_length] - sequence_output = self.dropout(sequence_output) - loss, pred_relations = self.extractor(sequence_output, entities, relations) + text_output = outputs[0][:, :seq_length] + text_output = self.dropout(text_output) + loss, pred_relations = self.extractor(text_output, entities, relations) + + if not return_dict: + output = (pred_relations,) + outputs[2:] + return ((loss,) + output) if loss is not None else output return RelationExtractionOutput( loss=loss, From 219a4f72a667cdb25bb342f0b4e96cf537155ca0 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Mon, 19 Sep 2022 20:01:21 +0000 Subject: [PATCH 5/9] Add test --- tests/models/layoutlmv2/test.py | 38 +++++++ .../layoutlmv2/test_modeling_layoutlmv2.py | 100 +++++++++++++++++- 2 files changed, 133 insertions(+), 5 deletions(-) create mode 100644 tests/models/layoutlmv2/test.py diff --git a/tests/models/layoutlmv2/test.py b/tests/models/layoutlmv2/test.py new file mode 100644 index 00000000000000..cb5ba15507e25a --- /dev/null +++ b/tests/models/layoutlmv2/test.py @@ -0,0 +1,38 @@ +# for example in range(self.batch_size): + # # sample a number of entities for the example + # num_entities = random.randint(1, self.max_entities) + # entity_starts = [] + # entity_ends = [] + # entity_labels = [] + # for entity in range(num_entities): + # entity_start = random.randint(0, self.seq_length) + # entity_end = entity_start + random.randint(1, self.max_entity_length) + # entity_label = random.randint(0, self.num_labels) + # entity_starts.append(entity_start) + # entity_ends.append(entity_end) + # entity_labels.append(entity_label) + # entity_dict = { + # "start": entity_starts, + # "end": entity_ends, + # "label": entity_labels, + # } + # entities.append(entity_dict) + + # # sample a number of relations for the example + # num_relations = random.randint(1, self.max_relations) + # start_indices = [] + # end_indices = [] + # heads = [] + # tails = [] + # for relation in range(num_relations): + # start_index = random.randint(0, self.seq_length) + # end_index = start_index + random.randint(1, self.max_entity_length) + # head = random.randint(0, self.max_entities) + # tail = random.randint(0, self.max_entities) + # relation_dict = { + # "start_index": start_indices, + # "end_index": end_indices, + # "head": heads, + # "tail": tails, + # } + # relations.append(relation_dict) \ No newline at end of file diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index 178feacda68340..d3aa6bc7d62c75 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -137,9 +137,14 @@ def prepare_config_and_inputs(self): sequence_labels = None token_labels = None + entities = None + relations = None if self.use_labels: sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + # we choose some random entities and relations + entities = [{"start": [0, 4], "end": [3, 6], "label": [2,1]} for _ in range(self.batch_size)] + relations = [{"start_index": [0], "end_index": [5], "head": [0], "tail": [1]} for _ in 
range(self.batch_size)] config = LayoutLMv2Config( vocab_size=self.vocab_size, @@ -164,10 +169,31 @@ def prepare_config_and_inputs(self): config.detectron2_config_args["MODEL.RESNETS.RES2_OUT_CHANNELS"] = 64 config.detectron2_config_args["MODEL.RESNETS.NUM_GROUPS"] = 1 - return config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels + return ( + config, + input_ids, + bbox, + image, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + entities, + relations, + ) def create_and_check_model( - self, config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels + self, + config, + input_ids, + bbox, + image, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + entities, + relations, ): model = LayoutLMv2Model(config=config) model.to(torch_device) @@ -183,7 +209,17 @@ def create_and_check_model( self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) def create_and_check_for_sequence_classification( - self, config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels + self, + config, + input_ids, + bbox, + image, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + entities, + relations, ): config.num_labels = self.num_labels model = LayoutLMv2ForSequenceClassification(config) @@ -200,7 +236,17 @@ def create_and_check_for_sequence_classification( self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) def create_and_check_for_token_classification( - self, config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels + self, + config, + input_ids, + bbox, + image, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + entities, + relations, ): config.num_labels = self.num_labels model = LayoutLMv2ForTokenClassification(config=config) @@ -217,7 +263,17 @@ def create_and_check_for_token_classification( self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) def create_and_check_for_question_answering( - self, config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels + self, + config, + input_ids, + bbox, + image, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + entities, + relations, ): model = LayoutLMv2ForQuestionAnswering(config=config) model.to(torch_device) @@ -234,6 +290,36 @@ def create_and_check_for_question_answering( self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + def create_and_check_for_relation_extraction( + self, + config, + input_ids, + bbox, + image, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + entities, + relations, + ): + model = LayoutLMv2ForRelationExtraction(config=config) + torch_device = "cpu" + model.to(torch_device) + model.eval() + result = model( + input_ids.to("cpu"), + bbox=bbox.to("cpu"), + image=image.to("cpu"), + attention_mask=input_mask.to("cpu"), + token_type_ids=token_type_ids.to("cpu"), + entities=entities, + relations=relations, + ) + print(result.pred_relations) + # self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + # self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( @@ -315,6 +401,10 @@ def 
test_for_question_answering(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + def test_for_relation_extraction(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_relation_extraction(*config_and_inputs) + def test_save_load_fast_init_from_base(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() base_class = MODEL_MAPPING[config.__class__] From 4c5c3dc5df1b9d9f66ebb4c16ac493b2a02725b4 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 20 Sep 2022 07:40:53 +0000 Subject: [PATCH 6/9] Make most tests pass --- .../models/layoutlmv2/modeling_layoutlmv2.py | 30 ++-- .../layoutlmv2/test_modeling_layoutlmv2.py | 149 ++++++++++++++++-- tests/test_modeling_common.py | 7 - 3 files changed, 151 insertions(+), 35 deletions(-) diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py index b0e6dc5fa046bb..b24a78e19ddf74 100755 --- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py @@ -89,9 +89,9 @@ class RelationExtractionOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None + pred_relations: dict = None entities: dict = None relations: dict = None - pred_relations: dict = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -546,7 +546,7 @@ class LayoutLMv2PreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize the weights""" - if isinstance(module, nn.Linear): + if isinstance(module, (nn.Linear, nn.Bilinear)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) @@ -1491,15 +1491,9 @@ def __init__(self, in_features, out_features): self.bilinear = torch.nn.Bilinear(in_features, in_features, out_features, bias=False) self.linear = torch.nn.Linear(2 * in_features, out_features, bias=True) - self.reset_parameters() - def forward(self, x_1, x_2): return self.bilinear(x_1, x_2) + self.linear(torch.cat((x_1, x_2), dim=-1)) - def reset_parameters(self): - self.bilinear.reset_parameters() - self.linear.reset_parameters() - class LayoutLMv2RelationExtractionDecoder(nn.Module): def __init__(self, config): @@ -1632,14 +1626,14 @@ def forward( position_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, - entities=None, - relations=None, + entities: Optional[dict] = None, + relations: Optional[dict] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ): r""" - entities (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + entities (...): ... relations (...): ... 
@@ -1671,20 +1665,24 @@ def forward( return_dict=return_dict, ) - seq_length = input_ids.size(1) + seq_length = input_ids.size(1) if input_ids is not None else inputs_embeds.size(1) text_output = outputs[0][:, :seq_length] text_output = self.dropout(text_output) - loss, pred_relations = self.extractor(text_output, entities, relations) + + loss = None + pred_relations = None + if entities is not None and relations is not None: + loss, pred_relations = self.extractor(text_output, entities, relations) if not return_dict: - output = (pred_relations,) + outputs[2:] - return ((loss,) + output) if loss is not None else output + output = (entities, relations) + outputs[2:] + return ((loss, pred_relations) + output) if loss is not None else output return RelationExtractionOutput( loss=loss, + pred_relations=pred_relations, entities=entities, relations=relations, - pred_relations=pred_relations, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index d3aa6bc7d62c75..ebb06f67319a5a 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -15,11 +15,13 @@ """ Testing suite for the PyTorch LayoutLMv2 model. """ +import copy import os import random import tempfile import unittest +from transformers.models.auto import get_values from transformers.testing_utils import require_detectron2, require_torch, require_torch_multi_gpu, slow, torch_device from transformers.utils import is_detectron2_available, is_torch_available @@ -31,6 +33,10 @@ import torch from transformers import ( + MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING, + MODEL_FOR_QUESTION_ANSWERING_MAPPING, + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_MAPPING, LayoutLMv2Config, LayoutLMv2ForQuestionAnswering, @@ -143,8 +149,10 @@ def prepare_config_and_inputs(self): sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) # we choose some random entities and relations - entities = [{"start": [0, 4], "end": [3, 6], "label": [2,1]} for _ in range(self.batch_size)] - relations = [{"start_index": [0], "end_index": [5], "head": [0], "tail": [1]} for _ in range(self.batch_size)] + entities = [{"start": [0, 4], "end": [3, 6], "label": [2, 1]} for _ in range(self.batch_size)] + relations = [ + {"start_index": [0], "end_index": [5], "head": [0], "tail": [1]} for _ in range(self.batch_size) + ] config = LayoutLMv2Config( vocab_size=self.vocab_size, @@ -304,21 +312,18 @@ def create_and_check_for_relation_extraction( relations, ): model = LayoutLMv2ForRelationExtraction(config=config) - torch_device = "cpu" model.to(torch_device) model.eval() result = model( - input_ids.to("cpu"), - bbox=bbox.to("cpu"), - image=image.to("cpu"), - attention_mask=input_mask.to("cpu"), - token_type_ids=token_type_ids.to("cpu"), + input_ids, + bbox=bbox, + image=image, + attention_mask=input_mask, + token_type_ids=token_type_ids, entities=entities, relations=relations, ) - print(result.pred_relations) - # self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - # self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertTrue(result.pred_relations) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ 
-331,6 +336,8 @@ def prepare_config_and_inputs_for_common(self): input_mask, sequence_labels, token_labels, + entities, + relations, ) = config_and_inputs inputs_dict = { "input_ids": input_ids, @@ -362,6 +369,42 @@ class LayoutLMv2ModelTest(ModelTesterMixin, unittest.TestCase): else () ) + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = copy.deepcopy(inputs_dict) + + if model_class.__name__ == "LayoutLMv2ForRelationExtraction": + # we choose some random entities and relations + entities = [{"start": [0, 4], "end": [3, 6], "label": [2, 1]} for _ in range(self.model_tester.batch_size)] + relations = [ + {"start_index": [0], "end_index": [5], "head": [0], "tail": [1]} + for _ in range(self.model_tester.batch_size) + ] + inputs_dict["entities"] = entities + inputs_dict["relations"] = relations + + if return_labels: + if model_class in [ + *get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING), + *get_values(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING), + ]: + + inputs_dict["start_positions"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + inputs_dict["end_positions"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + elif model_class in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): + inputs_dict["labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + elif model_class in get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + + return inputs_dict + def setUp(self): self.model_tester = LayoutLMv2ModelTester(self) self.config_tester = ConfigTester(self, config_class=LayoutLMv2Config, hidden_size=37) @@ -404,7 +447,7 @@ def test_for_question_answering(self): def test_for_relation_extraction(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_relation_extraction(*config_and_inputs) - + def test_save_load_fast_init_from_base(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() base_class = MODEL_MAPPING[config.__class__] @@ -577,6 +620,88 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + return + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + for model_class in self.all_model_classes: + if model_class.__name__ == "LayoutLMv2ForRelationExtraction": + continue + + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + inputs = self._prepare_for_class(inputs_dict, model_class) + + try: + input_ids = inputs["input_ids"] + bbox = inputs["bbox"] + image = inputs["image"].tensor + traced_model = torch.jit.trace( + model, (input_ids, bbox, image), check_trace=False + ) # when traced model is checked, an error is produced due to name mangling + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + 
model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + non_persistent_buffers = {} + for key in loaded_model_state_dict.keys(): + if key not in model_state_dict.keys(): + non_persistent_buffers[key] = loaded_model_state_dict[key] + + loaded_model_state_dict = { + key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers + } + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + model_buffers = list(model.buffers()) + for non_persistent_buffer in non_persistent_buffers.values(): + found_buffer = False + for i, model_buffer in enumerate(model_buffers): + if torch.equal(non_persistent_buffer, model_buffer): + found_buffer = True + break + + self.assertTrue(found_buffer) + model_buffers.pop(i) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + if layer_name in loaded_model_state_dict: + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + # Avoid memory leak. Without this, each call increase RAM usage by ~20MB. + # (Even with this call, there are still memory leak by ~0.04MB) + self.clear_torch_jit_class_registry() + def prepare_layoutlmv2_batch_inputs(): # Here we prepare a batch of 2 sequences to test a LayoutLMv2 forward pass on: diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 082f2a8a9057f9..78308ced25dce9 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -658,13 +658,6 @@ def _create_and_check_torchscript(self, config, inputs_dict): traced_model = torch.jit.trace( model, (main_input, attention_mask, decoder_input_ids, decoder_attention_mask) ) - elif "bbox" in inputs and "image" in inputs: # LayoutLMv2 requires additional inputs - input_ids = inputs["input_ids"] - bbox = inputs["bbox"] - image = inputs["image"].tensor - traced_model = torch.jit.trace( - model, (input_ids, bbox, image), check_trace=False - ) # when traced model is checked, an error is produced due to name mangling else: main_input = inputs[main_input_name] traced_model = torch.jit.trace(model, main_input) From cf7e2a878a5ae448b2d4ffa6566fcde7767e9415 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 20 Sep 2022 08:09:14 +0000 Subject: [PATCH 7/9] Make more tests pass --- .../layoutlmv2/test_modeling_layoutlmv2.py | 234 ++++++++++-------- 1 file changed, 133 insertions(+), 101 deletions(-) diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index ebb06f67319a5a..82bb61bd7276dc 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -58,7 +58,7 @@ def __init__( batch_size=2, num_channels=3, image_size=4, - seq_length=7, + text_seq_length=7, is_training=True, use_input_mask=True, use_token_type_ids=True, @@ -87,7 +87,7 @@ def __init__( self.batch_size = batch_size self.num_channels = num_channels self.image_size = image_size - self.seq_length = seq_length + self.text_seq_length = text_seq_length self.is_training = is_training self.use_input_mask = use_input_mask self.use_token_type_ids = use_token_type_ids @@ -112,10 +112,13 @@ def __init__( self.scope = scope self.range_bbox = range_bbox + # in LayoutLMv2, the seq length equals the number of text tokens + number of image tokens + self.seq_length = 
self.text_seq_length + self.image_feature_pool_shape[0] * self.image_feature_pool_shape[1] + def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids = ids_tensor([self.batch_size, self.text_seq_length], self.vocab_size) - bbox = ids_tensor([self.batch_size, self.seq_length, 4], self.range_bbox) + bbox = ids_tensor([self.batch_size, self.text_seq_length, 4], self.range_bbox) # Ensure that bbox is legal for i in range(bbox.shape[0]): for j in range(bbox.shape[1]): @@ -135,11 +138,11 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) + input_mask = random_attention_mask([self.batch_size, self.text_seq_length]) token_type_ids = None if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + token_type_ids = ids_tensor([self.batch_size, self.text_seq_length], self.type_vocab_size) sequence_labels = None token_labels = None @@ -147,7 +150,7 @@ def prepare_config_and_inputs(self): relations = None if self.use_labels: sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + token_labels = ids_tensor([self.batch_size, self.text_seq_length], self.num_labels) # we choose some random entities and relations entities = [{"start": [0, 4], "end": [3, 6], "label": [2, 1]} for _ in range(self.batch_size)] relations = [ @@ -211,9 +214,7 @@ def create_and_check_model( result = model(input_ids, bbox=bbox, image=image, token_type_ids=token_type_ids) result = model(input_ids, bbox=bbox, image=image) - # LayoutLMv2 has a different expected sequence length, namely also visual tokens are added - expected_seq_len = self.seq_length + self.image_feature_pool_shape[0] * self.image_feature_pool_shape[1] - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, self.hidden_size)) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) def create_and_check_for_sequence_classification( @@ -268,7 +269,7 @@ def create_and_check_for_token_classification( token_type_ids=token_type_ids, labels=token_labels, ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.text_seq_length, self.num_labels)) def create_and_check_for_question_answering( self, @@ -295,8 +296,8 @@ def create_and_check_for_question_answering( start_positions=sequence_labels, end_positions=sequence_labels, ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.text_seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.text_seq_length)) def create_and_check_for_relation_extraction( self, @@ -400,7 +401,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): ) elif model_class in get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING): inputs_dict["labels"] = torch.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + 
(self.model_tester.batch_size, self.model_tester.text_seq_length), dtype=torch.long, device=torch_device ) return inputs_dict @@ -496,113 +497,79 @@ class CopyClass(model_class): max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item() self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") - def test_attention_outputs(self): + def test_save_load_fast_init_to_base(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True + base_class = MODEL_MAPPING[config.__class__] - # LayoutLMv2 has a different expected sequence length - expected_seq_len = ( - self.model_tester.seq_length - + self.model_tester.image_feature_pool_shape[0] * self.model_tester.image_feature_pool_shape[1] - ) + if isinstance(base_class, tuple): + base_class = base_class[0] for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, expected_seq_len, expected_seq_len], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.attentions - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, expected_seq_len, expected_seq_len], - ) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + if model_class == base_class: + continue - hidden_states = outputs.hidden_states + # make a copy of model class to not break future tests + # from https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class + class CopyClass(base_class): + pass - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) + base_class_copy = CopyClass - # LayoutLMv2 has a different expected sequence length - expected_seq_len = ( - self.model_tester.seq_length - + 
self.model_tester.image_feature_pool_shape[0] * self.model_tester.image_feature_pool_shape[1] - ) + # make sure that all keys are expected for test + base_class_copy._keys_to_ignore_on_load_missing = [] - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [expected_seq_len, self.model_tester.hidden_size], - ) + # make init deterministic, but make sure that + # non-initialized weights throw errors nevertheless + base_class_copy._init_weights = self._mock_init_weights - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + state_dict = model.state_dict() - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) + # this will often delete a single weight of a multi-weight module + # to test an edge case + random_key_to_del = random.choice(list(state_dict.keys())) + del state_dict[random_key_to_del] - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True + # check that certain keys didn't get saved with the model + with tempfile.TemporaryDirectory() as tmpdirname: + model.config.save_pretrained(tmpdirname) + torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin")) - check_hidden_states_output(inputs_dict, config, model_class) + model_fast_init = base_class_copy.from_pretrained(tmpdirname) + model_slow_init = base_class_copy.from_pretrained(tmpdirname, _fast_init=False) + for key in model_fast_init.state_dict().keys(): + if key == "layoutlmv2.visual_segment_embedding": + # we skip the visual segment embedding as it has a custom initialization scheme + continue + max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item() + self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") + @slow def test_model_from_pretrained(self): for model_name in LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = LayoutLMv2Model.from_pretrained(model_name) self.assertIsNotNone(model) + def test_training(self): + if not self.model_tester.is_training: + return + + for model_class in self.all_model_classes: + print("Model class:", model_class) + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + if model_class in get_values(MODEL_MAPPING): + continue + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -620,6 +587,71 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) + # we overwrite this as LayoutLMv2ForRelationExtraction is not supported + def test_headmasking(self): + if not self.test_head_masking: + return + + global_rng = random.Random() + + global_rng.seed(42) + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + global_rng.seed() + + inputs_dict["output_attentions"] = True + config.output_hidden_states = True + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + for model_class in self.all_model_classes: + if model_class.__name__ == "LayoutLMv2ForRelationExtraction": + continue + + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + # Prepare head_mask + # Set 
require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) + head_mask = torch.ones( + self.model_tester.num_hidden_layers, + self.model_tester.num_attention_heads, + device=torch_device, + ) + head_mask[0, 0] = 0 + head_mask[-1, :-1] = 0 + head_mask.requires_grad_(requires_grad=True) + inputs = self._prepare_for_class(inputs_dict, model_class).copy() + inputs["head_mask"] = head_mask + outputs = model(**inputs, return_dict=True) + + # Test that we can get a gradient back for importance score computation + output = sum(t.sum() for t in outputs[0]) + output = output.sum() + output.backward() + multihead_outputs = head_mask.grad + + self.assertIsNotNone(multihead_outputs) + self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers) + + def check_attentions_validity(attentions): + # Remove Nan + for t in attentions: + self.assertLess( + torch.sum(torch.isnan(t)), t.numel() / 4 + ) # Check we don't have more than 25% nans (arbitrary) + attentions = [ + t.masked_fill(torch.isnan(t), 0.0) for t in attentions + ] # remove them (the test is less complete) + + self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0) + if len(attentions) > 2: # encoder-decoder models have only 2 layers in each module + self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) + + check_attentions_validity(outputs.attentions) + + # we overwrite this as LayoutLMv2 requires special inputs + LayoutLMv2ForRelationExtraction is not supported def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: return From 3a904ea2dcfe0fae258aa97c19c6046419ccbeb2 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 20 Sep 2022 08:16:12 +0000 Subject: [PATCH 8/9] Improve docstrign --- .../models/layoutlmv2/modeling_layoutlmv2.py | 24 ++++--- tests/models/layoutlmv2/test.py | 72 +++++++++---------- .../layoutlmv2/test_modeling_layoutlmv2.py | 26 ++----- 3 files changed, 54 insertions(+), 68 deletions(-) diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py index b24a78e19ddf74..9fc957b808989b 100755 --- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py @@ -1463,12 +1463,17 @@ def forward( class BiaffineAttention(torch.nn.Module): """Implements a biaffine attention operator for binary relation classification. - Args: + PyTorch implementation of the biaffine attention operator from "End-to-end neural relation extraction using deep biaffine attention" (https://arxiv.org/abs/1812.11275) which can be used as a classifier for binary relation classification. - in_features (int): The size of the feature dimension of the inputs. out_features (int): The size of the feature - dimension of the output. + + Args: + in_features (int): + The size of the feature dimension of the inputs. + out_features (int): + The size of the feature dimension of the output. + Shape: - x_1: `(N, *, in_features)` where `N` is the batch dimension and `*` means any number of additional dimensisons. @@ -1476,10 +1481,6 @@ class BiaffineAttention(torch.nn.Module): dimensions. 
- Output: `(N, *, out_features)`, where `N` is the batch dimension and `*` means any number of additional dimensions. - Examples: - >>> batch_size, in_features, out_features = 32, 100, 4 >>> biaffine_attention = BiaffineAttention(in_features, - out_features) >>> x_1 = torch.randn(batch_size, in_features) >>> x_2 = torch.randn(batch_size, in_features) >>> - output = biaffine_attention(x_1, x_2) >>> print(output.size()) torch.Size([32, 4]) """ def __init__(self, in_features, out_features): @@ -1669,10 +1670,11 @@ def forward( text_output = outputs[0][:, :seq_length] text_output = self.dropout(text_output) - loss = None - pred_relations = None - if entities is not None and relations is not None: - loss, pred_relations = self.extractor(text_output, entities, relations) + if entities is None or relations is None: + raise ValueError( + "You need to provide entities and relations. Instantiate relations with empty lists at inference time" + ) + loss, pred_relations = self.extractor(text_output, entities, relations) if not return_dict: output = (entities, relations) + outputs[2:] diff --git a/tests/models/layoutlmv2/test.py b/tests/models/layoutlmv2/test.py index cb5ba15507e25a..70ca480b9a633b 100644 --- a/tests/models/layoutlmv2/test.py +++ b/tests/models/layoutlmv2/test.py @@ -1,38 +1,38 @@ # for example in range(self.batch_size): - # # sample a number of entities for the example - # num_entities = random.randint(1, self.max_entities) - # entity_starts = [] - # entity_ends = [] - # entity_labels = [] - # for entity in range(num_entities): - # entity_start = random.randint(0, self.seq_length) - # entity_end = entity_start + random.randint(1, self.max_entity_length) - # entity_label = random.randint(0, self.num_labels) - # entity_starts.append(entity_start) - # entity_ends.append(entity_end) - # entity_labels.append(entity_label) - # entity_dict = { - # "start": entity_starts, - # "end": entity_ends, - # "label": entity_labels, - # } - # entities.append(entity_dict) +# # sample a number of entities for the example +# num_entities = random.randint(1, self.max_entities) +# entity_starts = [] +# entity_ends = [] +# entity_labels = [] +# for entity in range(num_entities): +# entity_start = random.randint(0, self.seq_length) +# entity_end = entity_start + random.randint(1, self.max_entity_length) +# entity_label = random.randint(0, self.num_labels) +# entity_starts.append(entity_start) +# entity_ends.append(entity_end) +# entity_labels.append(entity_label) +# entity_dict = { +# "start": entity_starts, +# "end": entity_ends, +# "label": entity_labels, +# } +# entities.append(entity_dict) - # # sample a number of relations for the example - # num_relations = random.randint(1, self.max_relations) - # start_indices = [] - # end_indices = [] - # heads = [] - # tails = [] - # for relation in range(num_relations): - # start_index = random.randint(0, self.seq_length) - # end_index = start_index + random.randint(1, self.max_entity_length) - # head = random.randint(0, self.max_entities) - # tail = random.randint(0, self.max_entities) - # relation_dict = { - # "start_index": start_indices, - # "end_index": end_indices, - # "head": heads, - # "tail": tails, - # } - # relations.append(relation_dict) \ No newline at end of file +# # sample a number of relations for the example +# num_relations = random.randint(1, self.max_relations) +# start_indices = [] +# end_indices = [] +# heads = [] +# tails = [] +# for relation in range(num_relations): +# start_index = random.randint(0, self.seq_length) +# end_index = 
start_index + random.randint(1, self.max_entity_length) +# head = random.randint(0, self.max_entities) +# tail = random.randint(0, self.max_entities) +# relation_dict = { +# "start_index": start_indices, +# "end_index": end_indices, +# "head": heads, +# "tail": tails, +# } +# relations.append(relation_dict) diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index 82bb61bd7276dc..294d4f7ce971dc 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -401,7 +401,9 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): ) elif model_class in get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING): inputs_dict["labels"] = torch.zeros( - (self.model_tester.batch_size, self.model_tester.text_seq_length), dtype=torch.long, device=torch_device + (self.model_tester.batch_size, self.model_tester.text_seq_length), + dtype=torch.long, + device=torch_device, ) return inputs_dict @@ -545,31 +547,13 @@ class CopyClass(base_class): continue max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item() self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") - + @slow def test_model_from_pretrained(self): for model_name in LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = LayoutLMv2Model.from_pretrained(model_name) self.assertIsNotNone(model) - def test_training(self): - if not self.model_tester.is_training: - return - - for model_class in self.all_model_classes: - print("Model class:", model_class) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - if model_class in get_values(MODEL_MAPPING): - continue - - model = model_class(config) - model.to(torch_device) - model.train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -650,7 +634,7 @@ def check_attentions_validity(attentions): self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) check_attentions_validity(outputs.attentions) - + # we overwrite this as LayoutLMv2 requires special inputs + LayoutLMv2ForRelationExtraction is not supported def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: From 1e844fa0c1d274a08e94f73739931284ddccacf3 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Tue, 20 Sep 2022 09:11:03 +0000 Subject: [PATCH 9/9] Make all tests pass --- .../models/layoutlmv2/modeling_layoutlmv2.py | 46 +++++++---- tests/models/layoutlmv2/test.py | 38 --------- .../layoutlmv2/test_modeling_layoutlmv2.py | 78 +++++++++++++++++++ 3 files changed, 111 insertions(+), 51 deletions(-) delete mode 100644 tests/models/layoutlmv2/test.py diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py index 9fc957b808989b..32766d4724940b 100755 --- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py @@ -70,14 +70,17 @@ class RelationExtractionOutput(ModelOutput): Class for outputs of [`LayoutLMv2ForRelationExtraction`]. Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + loss (`torch.FloatTensor` of shape `(1,)`: Classification (or regression if config.num_labels==1) loss. 
diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
index 82bb61bd7276dc..294d4f7ce971dc 100644
--- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
@@ -401,7 +401,9 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
             )
         elif model_class in get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING):
             inputs_dict["labels"] = torch.zeros(
-                (self.model_tester.batch_size, self.model_tester.text_seq_length), dtype=torch.long, device=torch_device
+                (self.model_tester.batch_size, self.model_tester.text_seq_length),
+                dtype=torch.long,
+                device=torch_device,
             )
 
         return inputs_dict
@@ -545,31 +547,13 @@ class CopyClass(base_class):
                 continue
             max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item()
             self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-    
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
             model = LayoutLMv2Model.from_pretrained(model_name)
             self.assertIsNotNone(model)
 
-    def test_training(self):
-        if not self.model_tester.is_training:
-            return
-
-        for model_class in self.all_model_classes:
-            print("Model class:", model_class)
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            config.return_dict = True
-
-            if model_class in get_values(MODEL_MAPPING):
-                continue
-
-            model = model_class(config)
-            model.to(torch_device)
-            model.train()
-            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            loss = model(**inputs).loss
-
     def test_initialization(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
@@ -650,7 +634,7 @@ def check_attentions_validity(attentions):
             self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
 
         check_attentions_validity(outputs.attentions)
-    
+
     # we overwrite this as LayoutLMv2 requires special inputs + LayoutLMv2ForRelationExtraction is not supported
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:

From 1e844fa0c1d274a08e94f73739931284ddccacf3 Mon Sep 17 00:00:00 2001
From: NielsRogge
Date: Tue, 20 Sep 2022 09:11:03 +0000
Subject: [PATCH 9/9] Make all tests pass

---
 .../models/layoutlmv2/modeling_layoutlmv2.py  | 46 +++++++----
 tests/models/layoutlmv2/test.py               | 38 ---------
 .../layoutlmv2/test_modeling_layoutlmv2.py    | 78 +++++++++++++++++++
 3 files changed, 111 insertions(+), 51 deletions(-)
 delete mode 100644 tests/models/layoutlmv2/test.py

diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
index 9fc957b808989b..32766d4724940b 100755
--- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
@@ -70,14 +70,17 @@ class RelationExtractionOutput(ModelOutput):
     Class for outputs of [`LayoutLMv2ForRelationExtraction`].
 
     Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        loss (`torch.FloatTensor` of shape `(1,)`):
             Classification (or regression if config.num_labels==1) loss.
-        entities (...)
-            ...
-        relations (...)
-            ...
-        pred_relations (...)
-            ...
+        entities (`list[dict]`):
+            List of dictionaries (one per example in the batch). Each dictionary contains 3 keys: `start`, `end` and
+            `label`.
+        relations (`list[dict]`):
+            List of dictionaries (one per example in the batch). Each dictionary contains 4 keys: `start_index`,
+            `end_index`, `head` and `tail`.
+        pred_relations (`list[dict]`):
+            List of dictionaries (one per example in the batch). Each dictionary contains 7 keys: `head`, `head_id`,
+            `head_type`, `tail`, `tail_id`, `tail_type` and `type`.
         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
             one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
@@ -1634,10 +1637,12 @@ def forward(
         return_dict: Optional[bool] = None,
     ):
         r"""
-        entities (...):
-            ...
-        relations (...):
-            ...
+        entities (`list[dict]`):
+            List of dictionaries (one per example in the batch). Each dictionary contains 3 keys: `start`, `end` and
+            `label`.
+        relations (`list[dict]`):
+            List of dictionaries (one per example in the batch). Each dictionary contains 4 keys: `start_index`,
+            `end_index`, `head` and `tail`.
 
         Returns:
 
@@ -1647,6 +1652,21 @@
         >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForRelationExtraction
         >>> from PIL import Image
         >>> from datasets import load_dataset
+
+        >>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+        >>> model = LayoutLMv2ForRelationExtraction.from_pretrained("microsoft/layoutlmv2-base-uncased")
+
+        >>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa")
+        >>> image_path = dataset["test"][0]["file"]
+        >>> image = Image.open(image_path).convert("RGB")
+        >>> encoding = processor(image, return_tensors="pt")
+
+        >>> # instantiate relations as empty at inference time
+        >>> encoding["entities"] = [{"start": [0, 4], "end": [3, 6], "label": [2, 1]}]
+        >>> encoding["relations"] = [{"start_index": [], "end_index": [], "head": [], "tail": []}]
+
+        >>> outputs = model(**encoding)
+        >>> predicted_relations = outputs.pred_relations[0]
         ```
         """
 
@@ -1677,8 +1697,8 @@ def forward(
             loss, pred_relations = self.extractor(text_output, entities, relations)
 
         if not return_dict:
-            output = (entities, relations) + outputs[2:]
-            return ((loss, pred_relations) + output) if loss is not None else output
+            output = (loss, pred_relations, entities, relations) + outputs[2:]
+            return output
 
         return RelationExtractionOutput(
             loss=loss,
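To make the new docstrings concrete before moving on to the test changes: at training time the caller supplies gold `entities` and `relations` in the documented format, and the model returns a loss plus `pred_relations`. The snippet below is a hypothetical continuation of the doctest above (it reuses its `model` and `encoding`); the label values and entity indices are invented for illustration.

```python
# training-time inputs in the documented format (values are made up):
# entity spans are token indices plus a label; relations reference entities
# by position through `head` and `tail`
encoding["entities"] = [{"start": [0, 4], "end": [3, 6], "label": [2, 1]}]
encoding["relations"] = [{"start_index": [0], "end_index": [6], "head": [0], "tail": [1]}]

outputs = model(**encoding)
print(outputs.loss)  # relation-classification loss

# each predicted relation is a plain dict with the seven documented keys
for relation in outputs.pred_relations[0]:
    print(relation["head_id"], relation["tail_id"], relation["type"])
```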
diff --git a/tests/models/layoutlmv2/test.py b/tests/models/layoutlmv2/test.py
deleted file mode 100644
index 70ca480b9a633b..00000000000000
--- a/tests/models/layoutlmv2/test.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# for example in range(self.batch_size):
-# # sample a number of entities for the example
-# num_entities = random.randint(1, self.max_entities)
-# entity_starts = []
-# entity_ends = []
-# entity_labels = []
-# for entity in range(num_entities):
-# entity_start = random.randint(0, self.seq_length)
-# entity_end = entity_start + random.randint(1, self.max_entity_length)
-# entity_label = random.randint(0, self.num_labels)
-# entity_starts.append(entity_start)
-# entity_ends.append(entity_end)
-# entity_labels.append(entity_label)
-# entity_dict = {
-# "start": entity_starts,
-# "end": entity_ends,
-# "label": entity_labels,
-# }
-# entities.append(entity_dict)
-
-# # sample a number of relations for the example
-# num_relations = random.randint(1, self.max_relations)
-# start_indices = []
-# end_indices = []
-# heads = []
-# tails = []
-# for relation in range(num_relations):
-# start_index = random.randint(0, self.seq_length)
-# end_index = start_index + random.randint(1, self.max_entity_length)
-# head = random.randint(0, self.max_entities)
-# tail = random.randint(0, self.max_entities)
-# relation_dict = {
-# "start_index": start_indices,
-# "end_index": end_indices,
-# "head": heads,
-# "tail": tails,
-# }
-# relations.append(relation_dict)
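The override added in the next diff exists because `pred_relations` is a list of plain Python dicts whose values include ints, which `torch.allclose` cannot compare. A toy illustration of the dispatch that `recursive_check` performs in the test that follows (the `compare` helper here is hypothetical, not part of the patch):

```python
import torch


def compare(a, b):
    # dicts are compared value-wise, plain ints by equality, and tensors via
    # torch.allclose -- mirroring recursive_check in the test below
    if isinstance(a, dict):
        return all(compare(x, y) for x, y in zip(a.values(), b.values()))
    if isinstance(a, int):
        return a == b
    return torch.allclose(a, b, atol=1e-5)


assert compare({"head_id": 0, "type": 1}, {"head_id": 0, "type": 1})
assert compare(torch.ones(2), torch.ones(2))
```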
diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
index 294d4f7ce971dc..05e452e24e4074 100644
--- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
@@ -20,6 +20,7 @@
 import random
 import tempfile
 import unittest
+from typing import Dict, List, Tuple
 
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_detectron2, require_torch, require_torch_multi_gpu, slow, torch_device
@@ -718,6 +719,83 @@ def _create_and_check_torchscript(self, config, inputs_dict):
         # (Even with this call, there are still memory leak by ~0.04MB)
         self.clear_torch_jit_class_registry()
 
+    # overwrite as LayoutLMv2ForRelationExtraction outputs dictionaries containing integers rather than tensors
+    def test_model_outputs_equivalence(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        def set_nan_tensor_to_zero(t):
+            t[t != t] = 0
+            return t
+
+        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
+            with torch.no_grad():
+                tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs)
+                dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
+
+                def recursive_check(tuple_object, dict_object):
+                    if isinstance(tuple_object, (List, Tuple)):
+                        for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
+                            recursive_check(tuple_iterable_value, dict_iterable_value)
+                    elif isinstance(tuple_object, Dict):
+                        for tuple_iterable_value, dict_iterable_value in zip(
+                            tuple_object.values(), dict_object.values()
+                        ):
+                            recursive_check(tuple_iterable_value, dict_iterable_value)
+                    elif tuple_object is None:
+                        return
+                    elif isinstance(tuple_object, int):
+                        self.assertEqual(tuple_object, dict_object)
+                    else:
+                        self.assertTrue(
+                            torch.allclose(
+                                set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
+                            ),
+                            msg=(
+                                "Tuple and dict output are not equal. Difference:"
+                                f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:"
+                                f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has"
+                                f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}."
+                            ),
+                        )
+
+                recursive_check(tuple_output, dict_output)
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+            check_equivalence(model, tuple_inputs, dict_inputs)
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            check_equivalence(model, tuple_inputs, dict_inputs)
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
+
+            if self.has_attentions:
+                tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+                dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+                check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
+
+                tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+                dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+                check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
+
+                tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+                dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+                check_equivalence(
+                    model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True}
+                )
+
 
 def prepare_layoutlmv2_batch_inputs():
     # Here we prepare a batch of 2 sequences to test a LayoutLMv2 forward pass on: