Add head_mask/decoder_head_mask for BART #9404

Closed · wants to merge 16 commits
Changes from 13 commits
118 changes: 113 additions & 5 deletions src/transformers/models/bart/modeling_bart.py
110 changes: 105 additions & 5 deletions src/transformers/models/blenderbot/modeling_blenderbot.py
109 changes: 104 additions & 5 deletions src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
110 changes: 105 additions & 5 deletions src/transformers/models/marian/modeling_marian.py
104 changes: 100 additions & 4 deletions src/transformers/models/mbart/modeling_mbart.py
110 changes: 105 additions & 5 deletions src/transformers/models/pegasus/modeling_pegasus.py

(Large diffs in the six modeling files above are not rendered by default.)
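For orientation, the substance of each modeling-file change is to accept a head_mask (encoder) and decoder_head_mask (decoder) of shape (num_layers, num_heads) and hand one row to each layer's attention module. A minimal sketch of the usual masking pattern, with illustrative names and shapes rather than the verbatim modeling_bart.py code:

import torch

def apply_layer_head_mask(attn_probs, layer_head_mask, bsz, num_heads, tgt_len, src_len):
    # attn_probs: post-softmax attention, shape (bsz * num_heads, tgt_len, src_len)
    # layer_head_mask: shape (num_heads,); 1.0 keeps a head, 0.0 silences it
    if layer_head_mask is not None:
        attn_probs = layer_head_mask.view(1, -1, 1, 1) * attn_probs.view(
            bsz, num_heads, tgt_len, src_len
        )
        attn_probs = attn_probs.view(bsz * num_heads, tgt_len, src_len)
    return attn_probs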

11 changes: 10 additions & 1 deletion tests/test_modeling_bart.py
@@ -51,18 +51,26 @@ def prepare_bart_inputs_dict(
     config,
     input_ids,
     decoder_input_ids=None,
+    head_mask=None,
     attention_mask=None,
     decoder_attention_mask=None,
+    decoder_head_mask=None,
 ):
     if attention_mask is None:
         attention_mask = input_ids.ne(config.pad_token_id)
+    if head_mask is None:
+        head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads)
     if decoder_attention_mask is None:
         decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
+    if decoder_head_mask is None:
+        decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads)
     return {
         "input_ids": input_ids,
         "decoder_input_ids": decoder_input_ids,
         "attention_mask": attention_mask,
+        "head_mask": head_mask,
         "decoder_attention_mask": attention_mask,
+        "decoder_head_mask": decoder_head_mask,
     }
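To make these test inputs concrete, here is a hedged usage sketch of the API being added (the tiny config values and token ids are made up for illustration; head_mask and decoder_head_mask only exist on BartModel once this PR is in):

import torch
from transformers import BartConfig, BartModel

config = BartConfig(
    vocab_size=100,
    d_model=16,
    encoder_layers=2,
    decoder_layers=2,
    encoder_attention_heads=4,
    decoder_attention_heads=4,
    encoder_ffn_dim=32,
    decoder_ffn_dim=32,
)
model = BartModel(config).eval()

input_ids = torch.tensor([[0, 11, 12, 2]])
decoder_input_ids = torch.tensor([[2, 0, 11, 12]])

# A head mask is a float tensor of shape (num_layers, num_heads):
# 1.0 leaves a head untouched, 0.0 zeroes out its attention weights.
head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads)
head_mask[0, 0] = 0.0  # disable head 0 of encoder layer 0
decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads)

outputs = model(
    input_ids,
    decoder_input_ids=decoder_input_ids,
    head_mask=head_mask,
    decoder_head_mask=decoder_head_mask,
)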


@@ -142,9 +150,10 @@ def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
         model = BartModel(config=config).get_decoder().to(torch_device).eval()
         input_ids = inputs_dict["input_ids"]
         attention_mask = inputs_dict["attention_mask"]
+        head_mask = inputs_dict["head_mask"]

         # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
+        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)

         output, past_key_values = outputs.to_tuple()
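Beyond the use_cache pass above, head masking is typically verified through the attention weights the model returns. A rough sketch of such a check, reusing model, input_ids, and the masks from the previous sketch (this assumes the mask is applied before attentions are captured, so a zeroed head reads back as exactly zero):

outputs = model(
    input_ids,
    decoder_input_ids=decoder_input_ids,
    head_mask=head_mask,
    decoder_head_mask=decoder_head_mask,
    output_attentions=True,
)
enc_attn_layer0 = outputs.encoder_attentions[0]  # (bsz, num_heads, src_len, src_len)
assert enc_attn_layer0[:, 0].abs().sum().item() == 0.0  # head 0 of layer 0 was masked
assert enc_attn_layer0[:, 1].abs().sum().item() > 0.0   # unmasked heads still attend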
11 changes: 10 additions & 1 deletion tests/test_modeling_blenderbot.py
@@ -39,17 +39,25 @@ def prepare_blenderbot_inputs_dict(
     input_ids,
     decoder_input_ids,
     attention_mask=None,
+    head_mask=None,
     decoder_attention_mask=None,
+    decoder_head_mask=None,
 ):
     if attention_mask is None:
         attention_mask = input_ids.ne(config.pad_token_id)
+    if head_mask is None:
+        head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads)
     if decoder_attention_mask is None:
         decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
+    if decoder_head_mask is None:
+        decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads)
     return {
         "input_ids": input_ids,
         "decoder_input_ids": decoder_input_ids,
         "attention_mask": attention_mask,
+        "head_mask": head_mask,
         "decoder_attention_mask": attention_mask,
+        "decoder_head_mask": decoder_head_mask,
     }


@@ -129,9 +137,10 @@ def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
         model = BlenderbotModel(config=config).get_decoder().to(torch_device).eval()
         input_ids = inputs_dict["input_ids"]
         attention_mask = inputs_dict["attention_mask"]
+        head_mask = inputs_dict["head_mask"]

         # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
+        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)

         output, past_key_values = outputs.to_tuple()
11 changes: 10 additions & 1 deletion tests/test_modeling_blenderbot_small.py
@@ -47,17 +47,25 @@ def prepare_blenderbot_small_inputs_dict(
     input_ids,
     decoder_input_ids,
     attention_mask=None,
+    head_mask=None,
     decoder_attention_mask=None,
+    decoder_head_mask=None,
 ):
     if attention_mask is None:
         attention_mask = input_ids.ne(config.pad_token_id)
+    if head_mask is None:
+        head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads)
     if decoder_attention_mask is None:
         decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
+    if decoder_head_mask is None:
+        decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads)
     return {
         "input_ids": input_ids,
         "decoder_input_ids": decoder_input_ids,
         "attention_mask": attention_mask,
+        "head_mask": head_mask,
         "decoder_attention_mask": attention_mask,
+        "decoder_head_mask": decoder_head_mask,
     }


@@ -137,9 +145,10 @@ def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
         model = BlenderbotSmallModel(config=config).get_decoder().to(torch_device).eval()
         input_ids = inputs_dict["input_ids"]
         attention_mask = inputs_dict["attention_mask"]
+        head_mask = inputs_dict["head_mask"]

         # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
+        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)

         output, past_key_values = outputs.to_tuple()
36 changes: 31 additions & 5 deletions tests/test_modeling_common.py
@@ -206,7 +206,12 @@ def test_forward_signature(self):
                     "decoder_attention_mask",
                     "encoder_outputs",
                 ]
-                self.assertListEqual(arg_names[:5], expected_arg_names)
+                if model.config.model_type in ["bart", "mbart", "marian", "blenderbot", "blenderbot-small", "pegasus"]:
Contributor:
The order of the signature arguments IMO should be as follows:

                expected_arg_names = [
                    "input_ids",
                    "attention_mask",
                    "decoder_input_ids",
                    "decoder_attention_mask",
                    "head_mask",
                    "decoder_head_mask",
                    "encoder_outputs",
                ]

The reason is that decoder_input_ids is more important than head_mask for torchscript.

For models like Bart we would still like to be able to use torchscript as follows:

traced_bart(input_ids, attention_mask, decoder_input_ids, decoder_attention_mask)

instead of having to do

traced_bart(input_ids, attention_mask, head_mask, decoder_input_ids, decoder_attention_mask)

where head_mask would have to be a tensor of all 1's, since 99% of the time it is not used for torchscript.

So it'd be great if we could slightly change the order in all ...Model and all ...ForConditionalGeneration models to match the list above. head_mask is simply used too little with torchscript, so it is fine to break the (first all encoder inputs, then all decoder inputs) grouping here.

We can adapt the test as follows:

...
            arg_names = [*signature.parameters.keys()]
            if model.config.is_encoder_decoder:
                expected_arg_names = [
                    "input_ids",
                    "attention_mask",
                    "decoder_input_ids",
                    "decoder_attention_mask",
                ]

                expected_arg_names.extend(["head_mask", "decoder_head_mask", "encoder_outputs"] if "head_mask" in arg_names else ["encoder_outputs"])
                ...

+                    expected_arg_names.insert(2, "head_mask")
+                    expected_arg_names.insert(5, "decoder_head_mask")
+                    self.assertListEqual(arg_names[:7], expected_arg_names)
+                else:
+                    self.assertListEqual(arg_names[:5], expected_arg_names)
             else:
                 expected_arg_names = ["input_ids"]
                 self.assertListEqual(arg_names[:1], expected_arg_names)
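The ordering argument above is ultimately about how traced modules are called. A toy sketch (hypothetical module, not the transformers code) showing that the parameter order of forward() is the positional call signature of the traced function:

import torch

class Toy(torch.nn.Module):
    def forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask):
        # Stand-in computation; a real model would run its encoder/decoder here.
        return input_ids + attention_mask + decoder_input_ids + decoder_attention_mask

example_inputs = tuple(torch.ones(2, 4) for _ in range(4))
traced = torch.jit.trace(Toy(), example_inputs)

# Traced modules take positional tensor arguments in forward() order, so if
# head_mask sat at position 2, every caller would have to pass an all-ones
# tensor there just to reach decoder_input_ids.
out = traced(*example_inputs)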
@@ -395,10 +400,31 @@ def _create_and_check_torchscript(self, config, inputs_dict):
                 attention_mask = inputs["attention_mask"]
                 decoder_input_ids = inputs["decoder_input_ids"]
                 decoder_attention_mask = inputs["decoder_attention_mask"]

-                traced_model = torch.jit.trace(
Contributor:

Let's try not to change this test. head_mask is more or less never used with torch.jit.trace. If you're keen, we could overwrite this test in all Bart-like models to include the head_mask, but it's not mandatory at all IMO.

However, we don't really like these if model.config.model_type not in ... statements in the tests.

-                    model, (input_ids, attention_mask, decoder_input_ids, decoder_attention_mask)
-                )
+                if model.config.model_type not in [
+                    "bart",
+                    "mbart",
+                    "marian",
+                    "blenderbot",
+                    "blenderbot-small",
+                    "pegasus",
+                ]:
+                    traced_model = torch.jit.trace(
+                        model, (input_ids, attention_mask, decoder_input_ids, decoder_attention_mask)
+                    )
+                else:
+                    head_mask = inputs["head_mask"]
+                    decoder_head_mask = inputs["decoder_head_mask"]
+                    traced_model = torch.jit.trace(
+                        model,
+                        (
+                            input_ids,
+                            attention_mask,
+                            head_mask,
+                            decoder_input_ids,
+                            decoder_attention_mask,
+                            decoder_head_mask,
+                        ),
+                    )
             else:
                 input_ids = inputs["input_ids"]
                 traced_model = torch.jit.trace(model, input_ids)
11 changes: 10 additions & 1 deletion tests/test_modeling_marian.py
@@ -53,17 +53,25 @@ def prepare_marian_inputs_dict(
     input_ids,
     decoder_input_ids,
     attention_mask=None,
+    head_mask=None,
     decoder_attention_mask=None,
+    decoder_head_mask=None,
 ):
     if attention_mask is None:
         attention_mask = input_ids.ne(config.pad_token_id)
+    if head_mask is None:
+        head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads)
     if decoder_attention_mask is None:
         decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
+    if decoder_head_mask is None:
+        decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads)
     return {
         "input_ids": input_ids,
         "decoder_input_ids": decoder_input_ids,
         "attention_mask": attention_mask,
+        "head_mask": head_mask,
         "decoder_attention_mask": attention_mask,
+        "decoder_head_mask": decoder_head_mask,
     }


@@ -146,9 +154,10 @@ def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
         model = MarianModel(config=config).get_decoder().to(torch_device).eval()
         input_ids = inputs_dict["input_ids"]
         attention_mask = inputs_dict["attention_mask"]
+        head_mask = inputs_dict["head_mask"]

         # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
+        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)

         output, past_key_values = outputs.to_tuple()
11 changes: 10 additions & 1 deletion tests/test_modeling_mbart.py
@@ -48,17 +48,25 @@ def prepare_mbart_inputs_dict(
     input_ids,
     decoder_input_ids,
     attention_mask=None,
+    head_mask=None,
     decoder_attention_mask=None,
+    decoder_head_mask=None,
 ):
     if attention_mask is None:
         attention_mask = input_ids.ne(config.pad_token_id)
+    if head_mask is None:
+        head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads)
     if decoder_attention_mask is None:
         decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
+    if decoder_head_mask is None:
+        decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads)
     return {
         "input_ids": input_ids,
         "decoder_input_ids": decoder_input_ids,
+        "head_mask": head_mask,
         "attention_mask": attention_mask,
         "decoder_attention_mask": attention_mask,
+        "decoder_head_mask": decoder_head_mask,
     }


@@ -138,9 +146,10 @@ def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
         model = MBartModel(config=config).get_decoder().to(torch_device).eval()
         input_ids = inputs_dict["input_ids"]
         attention_mask = inputs_dict["attention_mask"]
+        head_mask = inputs_dict["head_mask"]

         # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
+        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)

         output, past_key_values = outputs.to_tuple()
11 changes: 10 additions & 1 deletion tests/test_modeling_pegasus.py
@@ -40,17 +40,25 @@ def prepare_pegasus_inputs_dict(
     input_ids,
     decoder_input_ids,
     attention_mask=None,
+    head_mask=None,
     decoder_attention_mask=None,
+    decoder_head_mask=None,
 ):
     if attention_mask is None:
         attention_mask = input_ids.ne(config.pad_token_id)
+    if head_mask is None:
+        head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads)
     if decoder_attention_mask is None:
         decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
+    if decoder_head_mask is None:
+        decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads)
     return {
         "input_ids": input_ids,
         "decoder_input_ids": decoder_input_ids,
         "attention_mask": attention_mask,
+        "head_mask": head_mask,
         "decoder_attention_mask": attention_mask,
+        "decoder_head_mask": decoder_head_mask,
     }


@@ -130,9 +138,10 @@ def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
         model = PegasusModel(config=config).get_decoder().to(torch_device).eval()
         input_ids = inputs_dict["input_ids"]
         attention_mask = inputs_dict["attention_mask"]
+        head_mask = inputs_dict["head_mask"]

         # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
+        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)

         output, past_key_values = outputs.to_tuple()