diff --git a/docs/source/model_doc/luke.rst b/docs/source/model_doc/luke.rst index 95b50bf006e26a..b3190ea6532db9 100644 --- a/docs/source/model_doc/luke.rst +++ b/docs/source/model_doc/luke.rst @@ -1,5 +1,5 @@ .. - Copyright 2020 The HuggingFace Team. All rights reserved. + Copyright 2021 The HuggingFace Team. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at diff --git a/src/transformers/models/luke/__init__.py b/src/transformers/models/luke/__init__.py index 106a65e22c9f0a..4f5f3155581ab6 100644 --- a/src/transformers/models/luke/__init__.py +++ b/src/transformers/models/luke/__init__.py @@ -2,7 +2,7 @@ # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2021 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/transformers/models/luke/configuration_luke.py b/src/transformers/models/luke/configuration_luke.py index 6c434fc8cc00aa..1a8ab38ea28b91 100644 --- a/src/transformers/models/luke/configuration_luke.py +++ b/src/transformers/models/luke/configuration_luke.py @@ -21,8 +21,8 @@ logger = logging.get_logger(__name__) LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "luke-base": "https://huggingface.co/studio-ousia/luke-base/resolve/main/config.json", - "luke-large": "https://huggingface.co/studio-ousia/luke-large/resolve/main/config.json", + "studio-ousia/luke-base": "https://huggingface.co/studio-ousia/luke-base/resolve/main/config.json", + "studio-ousia/luke-large": "https://huggingface.co/studio-ousia/luke-large/resolve/main/config.json", } diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py index 66d94aa5253ae1..15b43b3a729b66 100644 --- a/src/transformers/models/luke/modeling_luke.py +++ b/src/transformers/models/luke/modeling_luke.py @@ -53,8 +53,6 @@ class BaseLukeModelOutputWithPooling(BaseModelOutputWithPooling): """ Base class for outputs of the LUKE model. - - Args: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. @@ -86,8 +84,6 @@ class BaseLukeModelOutput(BaseModelOutput): """ Base class for model's outputs, with potential hidden states and attentions. - - Args: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. @@ -119,8 +115,6 @@ class EntityClassificationOutput(ModelOutput): """ Outputs of entity classification models. - - Args: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Classification (or regression if config.num_labels==1) loss. @@ -152,8 +146,6 @@ class EntityPairClassificationOutput(ModelOutput): """ Outputs of entity pair classification models. - - Args: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Classification (or regression if config.num_labels==1) loss. @@ -185,8 +177,6 @@ class EntitySpanClassificationOutput(ModelOutput): """ Outputs of entity span classification models. 
- - Args: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Classification (or regression if config.num_labels==1) loss. @@ -230,10 +220,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - # End copy self.padding_idx = config.pad_token_id self.position_embeddings = nn.Embedding( @@ -336,8 +322,8 @@ def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads) + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." ) self.num_attention_heads = config.num_attention_heads @@ -495,9 +481,7 @@ def forward( else: entity_attention_output = attention_output[:, word_size:, :] - outputs = (word_attention_output, entity_attention_output) + self_outputs[ - 2: - ] # add attentions if we output them + outputs = (word_attention_output, entity_attention_output) + self_outputs[2:] # add attentions if we output them return outputs @@ -721,8 +705,6 @@ def _init_weights(self, module: nn.Module): subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. - - Parameters: config (:class:`~transformers.LukeConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the @@ -731,8 +713,6 @@ def _init_weights(self, module: nn.Module): """ LUKE_INPUTS_DOCSTRING = r""" - - Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): Indices of input sequence tokens in the vocabulary. @@ -745,8 +725,6 @@ def _init_weights(self, module: nn.Module): attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. @@ -755,8 +733,6 @@ def _init_weights(self, module: nn.Module): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: - - - 0 corresponds to a `sentence A` token, - 1 corresponds to a `sentence B` token. @@ -777,8 +753,6 @@ def _init_weights(self, module: nn.Module): entity_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, entity_length)`, `optional`): Mask to avoid performing attention on padding entity token indices. Mask values selected in ``[0, 1]``: - - - 1 for entity tokens that are **not masked**, - 0 for entity tokens that are **masked**. @@ -786,8 +760,6 @@ def _init_weights(self, module: nn.Module): Segment token indices to indicate first and second portions of the entity token inputs. Indices are selected in ``[0, 1]``: - - - 0 corresponds to a `portion A` entity token, - 1 corresponds to a `portion B` entity token. 
@@ -803,8 +775,6 @@ def _init_weights(self, module: nn.Module): head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - - - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. @@ -877,7 +847,6 @@ def forward( Returns: - Examples:: >>> from transformers import LukeTokenizer, LukeModel @@ -1232,7 +1201,7 @@ def forward( return ((loss,) + output) if loss is not None else output return EntityPairClassificationOutput( - loss=loss if loss is not None else None, + loss=loss, logits=logits, hidden_states=outputs.hidden_states, entity_hidden_states=outputs.entity_hidden_states, @@ -1354,7 +1323,7 @@ def forward( return ((loss,) + output) if loss is not None else output return EntitySpanClassificationOutput( - loss=loss if loss is not None else None, + loss=loss, logits=logits, hidden_states=outputs.hidden_states, entity_hidden_states=outputs.entity_hidden_states, diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py index 26e256b84a3a7b..eb8a085df2308b 100644 --- a/src/transformers/models/luke/tokenization_luke.py +++ b/src/transformers/models/luke/tokenization_luke.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for LUKE.""" + import itertools import json import os @@ -95,10 +96,10 @@ class LukeTokenizer(RobertaTokenizer): max_mention_length (:obj:`int`, `optional`, defaults to 30): The maximum number of tokens inside an entity span. entity_token_1 (:obj:`str`, `optional`, defaults to :obj:`<ent>`): - The special token representing an entity span. This token is only used when `task` is set to + The special token representing an entity span. This token is only used when ``task`` is set to "entity_classification" or "entity_pair_classification". entity_token_2 (:obj:`str`, `optional`, defaults to :obj:`<ent2>`): - The special token representing an entity span. This token is only used when `task` is set to + The special token representing an entity span. This token is only used when ``task`` is set to "entity_pair_classification". """ @@ -130,9 +131,7 @@ def __init__( if isinstance(entity_token_2, str) else entity_token_2 ) - kwargs["additional_special_tokens"] = [entity_token_1, entity_token_2] + kwargs.get( - "additional_special_tokens", [] - ) + kwargs["additional_special_tokens"] = [entity_token_1, entity_token_2] + kwargs.get("additional_special_tokens", []) super().__init__( vocab_file=vocab_file, @@ -156,7 +155,7 @@ def __init__( elif task == "entity_pair_classification": self.max_entity_length = 2 else: - raise ValueError(f"Task {task} not supported") + raise ValueError(f"Task {task} not supported. Select task from ['entity_classification', 'entity_pair_classification', 'entity_span_classification'] only.") self.max_mention_length = max_mention_length @@ -332,9 +331,9 @@ def encode_plus( **kwargs ) -> BatchEncoding: """ - Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated, - ``__call__`` should be used instead. - + Tokenize and prepare for the model a sequence or a pair of sequences. + + .. warning:: This method is deprecated, ``__call__`` should be used instead. 
Args: text (:obj:`str`): @@ -1401,7 +1400,7 @@ def _pad( return encoded_inputs def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: - vocab_file, merge_file = super(LukeTokenizer, self).save_vocabulary(save_directory, filename_prefix) + vocab_file, merge_file = super().save_vocabulary(save_directory, filename_prefix) entity_vocab_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["entity_vocab_file"] diff --git a/tests/test_tokenization_luke.py b/tests/test_tokenization_luke.py index 56695254739a69..1146cb4f3299bc 100644 --- a/tests/test_tokenization_luke.py +++ b/tests/test_tokenization_luke.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2021 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
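For reference, a minimal usage sketch of the pieces this diff touches: the ``studio-ousia/luke-base`` checkpoint id that now matches the keys in ``LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP``, and the word/entity outputs described in the modeling docstrings. This is not part of the diff itself; the example sentence, the ``entity_spans`` argument, and ``return_tensors="pt"`` are assumptions about the tokenizer's call signature rather than something shown in the hunks above::

    >>> from transformers import LukeTokenizer, LukeModel

    >>> # The checkpoint id mirrors the renamed archive-map keys above.
    >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
    >>> model = LukeModel.from_pretrained("studio-ousia/luke-base")

    >>> text = "Beyoncé lives in Los Angeles."
    >>> entity_spans = [(0, 7)]  # character-based span covering "Beyoncé" (assumed convention, not shown in this diff)
    >>> encoding = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")

    >>> outputs = model(**encoding)
    >>> word_hidden_states = outputs.last_hidden_state  # (batch_size, sequence_length, hidden_size)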
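And a sketch of the tokenizer ``task`` argument whose error message the tokenization hunk rewrites. The cap on entity spans comes from ``LukeTokenizer.__init__`` in the diff (one span for ``"entity_classification"``, two for ``"entity_pair_classification"``); reusing the base checkpoint id together with ``task`` is an assumption made for illustration::

    >>> from transformers import LukeTokenizer

    >>> # "entity_classification" caps the tokenizer at a single entity span per example.
    >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base", task="entity_classification")
    >>> tokenizer.max_entity_length
    1

    >>> # "entity_pair_classification" allows exactly two spans, marked with the <ent> and <ent2> special tokens.
    >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base", task="entity_pair_classification")
    >>> tokenizer.max_entity_length
    2

Passing any other string for ``task`` raises the ``ValueError`` shown in the hunk, which now spells out the supported task names instead of the bare "Task {task} not supported".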