Typo fixes #11432

Merged 1 commit on Apr 26, 2021
4 changes: 2 additions & 2 deletions src/transformers/commands/add_new_model.py
@@ -57,14 +57,14 @@ def run(self):
if not _has_cookiecutter:
raise ImportError(
"Model creation dependencies are required to use the `add_new_model` command. Install them by running "
"the folowing at the root of your `transformers` clone:\n\n\t$ pip install -e .[modelcreation]\n"
"the following at the root of your `transformers` clone:\n\n\t$ pip install -e .[modelcreation]\n"
)
# Ensure that there is no other `cookiecutter-template-xxx` directory in the current working directory
directories = [directory for directory in os.listdir() if "cookiecutter-template-" == directory[:22]]
if len(directories) > 0:
raise ValueError(
"Several directories starting with `cookiecutter-template-` in current working directory. "
"Please clean your directory by removing all folders startign with `cookiecutter-template-` or "
"Please clean your directory by removing all folders starting with `cookiecutter-template-` or "
"change your working directory."
)

2 changes: 1 addition & 1 deletion src/transformers/data/processors/squad.py
@@ -244,7 +244,7 @@ def squad_convert_example_to_features(
cls_index = span["input_ids"].index(tokenizer.cls_token_id)

# p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
# Original TF implem also keep the classification token (set to 0)
# Original TF implementation also keep the classification token (set to 0)
p_mask = np.ones_like(span["token_type_ids"])
if tokenizer.padding_side == "right":
p_mask[len(truncated_query) + sequence_added_tokens :] = 0
2 changes: 1 addition & 1 deletion src/transformers/feature_extraction_sequence_utils.py
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Sequence feature extraction class for common feature extrcactors to preprocess sequences.
Sequence feature extraction class for common feature extractors to preprocess sequences.
"""
from typing import Dict, List, Optional, Union

2 changes: 1 addition & 1 deletion src/transformers/file_utils.py
@@ -551,7 +551,7 @@ def wrapper(*args, **kwargs):
("sklearn", (is_sklearn_available, SKLEARN_IMPORT_ERROR)),
("speech", (is_speech_available, SPEECH_IMPORT_ERROR)),
("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)),
("tokenziers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)),
("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)),
("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
("vision", (is_vision_available, VISION_IMPORT_ERROR)),
]
2 changes: 1 addition & 1 deletion src/transformers/generation_logits_process.py
@@ -446,7 +446,7 @@ def _set_scores_to_inf_for_banned_tokens(self, scores: torch.Tensor, banned_toke

class PrefixConstrainedLogitsProcessor(LogitsProcessor):
r"""
:class:`transformers.LogitsProcessor` that enforces contrained generation and is useful for prefix-conditioned
:class:`transformers.LogitsProcessor` that enforces constrained generation and is useful for prefix-conditioned
constrained generation. See `Autoregressive Entity Retrieval <https://arxiv.org/abs/2010.00904>`__ for more
information.

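Aside (not part of the PR): PrefixConstrainedLogitsProcessor is what backs the prefix_allowed_tokens_fn argument of generate(). Below is a minimal sketch of driving constrained generation from user code, assuming a gpt2 checkpoint is available; the forced-continuation rule and helper names are illustrative only.

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt")
prompt_len = inputs.input_ids.shape[-1]

# Force the continuation " Paris" token by token, then fall back to the full vocabulary.
forced_ids = tokenizer(" Paris", add_special_tokens=False).input_ids

def allowed_tokens(batch_id, input_ids):
    step = input_ids.shape[-1] - prompt_len
    if step < len(forced_ids):
        return [forced_ids[step]]
    return list(range(len(tokenizer)))

outputs = model.generate(
    inputs.input_ids,
    max_length=prompt_len + 5,
    num_beams=2,
    prefix_allowed_tokens_fn=allowed_tokens,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))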
2 changes: 1 addition & 1 deletion src/transformers/generation_stopping_criteria.py
@@ -23,7 +23,7 @@
Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
or scores for each vocabulary token after SoftMax.
kwargs:
Additional stopping critera specific kwargs.
Additional stopping criteria specific kwargs.

Return:
:obj:`bool`. :obj:`False` indicates we should continue, :obj:`True` indicates we should stop.
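Aside (not part of the PR): a sketch of a custom stopping criterion built on the interface documented above, where returning True means stop. It assumes StoppingCriteria and StoppingCriteriaList are exposed at the top level of transformers; the class name and token id are made up for illustration.

import torch
from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnTokenCriteria(StoppingCriteria):
    """Stop once every sequence in the batch ends with `stop_token_id`."""

    def __init__(self, stop_token_id: int):
        self.stop_token_id = stop_token_id

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # False -> keep generating, True -> stop, matching the docstring above.
        return bool((input_ids[:, -1] == self.stop_token_id).all())

criteria = StoppingCriteriaList([StopOnTokenCriteria(stop_token_id=50256)])
# The list can then be handed to generation, e.g. model.generate(..., stopping_criteria=criteria).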
4 changes: 2 additions & 2 deletions src/transformers/generation_tf_utils.py
@@ -442,8 +442,8 @@ def _generate_no_beam_search(
**kwargs
):
"""
Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated
independantly.
Generate sequences for each example without beam search (num_beams == 1). All returned sequences are generated
independently.
"""

# length of generated sentences / unfinished sentences
2 changes: 1 addition & 1 deletion src/transformers/generation_utils.py
@@ -821,7 +821,7 @@ def generate(
... "at least two people were killed in a suspected bomb attack on a passenger bus "
... "in the strife-torn southern philippines on monday , the military said."
... )
>>> # encode input contex
>>> # encode input context
>>> input_ids = tokenizer(document, return_tensors="pt").input_ids
>>> # generate 3 independent sequences using beam search decoding (5 beams)
>>> # with T5 encoder-decoder model conditioned on short news article.
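Aside (not part of the PR): roughly how the docstring example above continues, sketched here with a t5-base checkpoint assumed as the encoder-decoder model; the actual docstring may use a different checkpoint and decoding arguments.

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

document = (
    "at least two people were killed in a suspected bomb attack on a passenger bus "
    "in the strife-torn southern philippines on monday , the military said."
)
# encode input context
input_ids = tokenizer(document, return_tensors="pt").input_ids
# generate 3 independent sequences using beam search decoding (5 beams)
outputs = model.generate(input_ids, num_beams=5, num_return_sequences=3, max_length=40)
for sequence in outputs:
    print(tokenizer.decode(sequence, skip_special_tokens=True))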
2 changes: 1 addition & 1 deletion src/transformers/modeling_flax_utils.py
@@ -94,7 +94,7 @@ def __init__(
self.key = PRNGKey(seed)
self.dtype = dtype

# randomely initialized parameters
# randomly initialized parameters
random_params = self.init_weights(self.key, input_shape)

# save required_params as set
4 changes: 2 additions & 2 deletions src/transformers/modeling_outputs.py
@@ -343,7 +343,7 @@ class CausalLMOutputWithPast(ModelOutput):
Language modeling loss (for next-token prediction).
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (:obj:`tuple(tupel(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

@@ -423,7 +423,7 @@ class SequenceClassifierOutputWithPast(ModelOutput):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
past_key_values (:obj:`tuple(tupel(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

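Aside (not part of the PR): a short sketch of the past_key_values layout documented above, assuming a gpt2 checkpoint; the printed shape follows the (batch_size, num_heads, sequence_length, embed_size_per_head) convention.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Hello world", return_tensors="pt")
with torch.no_grad():
    out = model(**inputs, use_cache=True)

# One entry per layer, each holding the cached key and value tensors.
print(len(out.past_key_values))  # config.n_layer, 12 for gpt2
key, value = out.past_key_values[0]
print(key.shape)  # (batch_size, num_heads, sequence_length, embed_size_per_head)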
2 changes: 1 addition & 1 deletion src/transformers/modeling_tf_pytorch_utils.py
@@ -51,7 +51,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="")
) # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
tf_name = re.sub(r"//+", "/", tf_name) # Remove empty levels at the end
tf_name = tf_name.split("/") # Convert from TF2.0 '/' separators to PyTorch '.' separators
# Some weights have a single name withtout "/" such as final_logits_bias in BART
# Some weights have a single name without "/" such as final_logits_bias in BART
if len(tf_name) > 1:
tf_name = tf_name[1:] # Remove level zero

4 changes: 2 additions & 2 deletions src/transformers/modeling_tf_utils.py
@@ -659,7 +659,7 @@ def serving(self, inputs):

Args:
inputs (:obj:`Dict[str, tf.Tensor]`):
The input of the saved model as a dictionnary of tensors.
The input of the saved model as a dictionary of tensors.
"""
output = self.call(inputs)

@@ -944,7 +944,7 @@ def _get_resized_lm_head_decoder(self, old_lm_head_decoder, new_num_tokens):
vectors from the end. If not provided or :obj:`None`, just returns None

Return:
:obj:`tf.Variable`: Pointer to the resized decoder or None if the output embeddings are differents of the
:obj:`tf.Variable`: Pointer to the resized decoder or None if the output embeddings are different from the
input ones.
"""
new_lm_head_decoder = old_lm_head_decoder
2 changes: 1 addition & 1 deletion src/transformers/modeling_utils.py
@@ -291,7 +291,7 @@ def get_head_mask(
The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
num_hidden_layers (:obj:`int`):
The number of hidden layers in the model.
is_attention_chunked: (:obj:`bool`, `optional, defaults to :obj:`False`):
is_attention_chunked: (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the attentions scores are computed by chunks or not.

Returns:
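Aside (not part of the PR): an illustrative sketch of the head-mask convention documented above (1.0 keeps a head, 0.0 discards it), assuming a bert-base-uncased checkpoint.

import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Shape (num_hidden_layers, num_attention_heads); zero out the first head of every layer.
head_mask = torch.ones(model.config.num_hidden_layers, model.config.num_attention_heads)
head_mask[:, 0] = 0.0

inputs = tokenizer("Masking attention heads", return_tensors="pt")
outputs = model(**inputs, head_mask=head_mask)
print(outputs.last_hidden_state.shape)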
2 changes: 1 addition & 1 deletion src/transformers/models/auto/modeling_auto.py
@@ -716,7 +716,7 @@
"AutoModelForPreTraining", MODEL_FOR_PRETRAINING_MAPPING, head_doc="pretraining"
)

# Private on puprose, the public class will add the deprecation warnings.
# Private on purpose, the public class will add the deprecation warnings.
_AutoModelWithLMHead = auto_class_factory(
"AutoModelWithLMHead", MODEL_WITH_LM_HEAD_MAPPING, head_doc="language modeling"
)
2 changes: 1 addition & 1 deletion src/transformers/models/auto/modeling_flax_auto.py
@@ -103,7 +103,7 @@
)

FlaxAutoModelForSequenceClassification = auto_class_factory(
"AFlaxutoModelForSequenceClassification",
"FlaxAutoModelForSequenceClassification",
FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
head_doc="sequence classification",
)
2 changes: 1 addition & 1 deletion src/transformers/models/auto/modeling_tf_auto.py
@@ -469,7 +469,7 @@
"TFAutoModelForPreTraining", TF_MODEL_FOR_PRETRAINING_MAPPING, head_doc="pretraining"
)

# Private on puprose, the public class will add the deprecation warnings.
# Private on purpose, the public class will add the deprecation warnings.
_TFAutoModelWithLMHead = auto_class_factory(
"TFAutoModelWithLMHead", TF_MODEL_WITH_LM_HEAD_MAPPING, head_doc="language modeling"
)
2 changes: 1 addition & 1 deletion src/transformers/models/bart/configuration_bart.py
@@ -171,7 +171,7 @@ def __init__(
self.gradient_checkpointing = gradient_checkpointing
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True

# ensure backward compatibilty for BART CNN models
# ensure backward compatibility for BART CNN models
if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
self.forced_bos_token_id = self.bos_token_id
warnings.warn(
6 changes: 3 additions & 3 deletions src/transformers/models/bart/modeling_bart.py
@@ -111,7 +111,7 @@ class BartLearnedPositionalEmbedding(nn.Embedding):

def __init__(self, num_embeddings: int, embedding_dim: int):
# Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models dont have this hack
# and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)

@@ -236,9 +236,9 @@ def forward(
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
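Aside (not part of the PR): a simplified sketch of the offset-by-2 trick mentioned in the BartLearnedPositionalEmbedding comment above; this is stripped down from the real module, so treat the class as illustrative rather than the actual implementation.

import torch
import torch.nn as nn

class LearnedPositionalEmbedding(nn.Embedding):
    """Learned positional embeddings whose table is widened by a fixed offset, as in BART."""

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # Widen the table by the offset and shift every position id before lookup.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        positions = torch.arange(input_ids.shape[1], dtype=torch.long, device=input_ids.device)
        return super().forward(positions + self.offset)

emb = LearnedPositionalEmbedding(num_embeddings=1024, embedding_dim=16)
print(emb(torch.zeros(1, 8, dtype=torch.long)).shape)  # torch.Size([8, 16])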
10 changes: 5 additions & 5 deletions src/transformers/models/bart/modeling_tf_bart.py
@@ -116,7 +116,7 @@ class TFBartLearnedPositionalEmbedding(TFSharedEmbeddings):

def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
# Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models dont have this hack
# and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim, **kwargs)

@@ -572,7 +572,7 @@ def serving(self, inputs):
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.

decoder_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
@@ -677,7 +677,7 @@ def call(
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.

inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
@@ -856,14 +856,14 @@ def call(
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.

encoder_head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
on hidden heads. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.

past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
@@ -304,7 +304,7 @@ def tokenize(self, text, never_split=None, **kwargs):


class CharacterTokenizer:
"""Runs Character tokenziation."""
"""Runs Character tokenization."""

def __init__(self, vocab, unk_token, normalize_text=True):
"""
2 changes: 1 addition & 1 deletion src/transformers/models/bertweet/tokenization_bertweet.py
@@ -451,7 +451,7 @@ def add_from_file(self, f):
the class Tokenizer.

4. When instantiating Tokenizer objects, there is a single option: preserve_case. By default, it is set to True. If it
is set to False, then the tokenizer will downcase everything except for emoticons.
is set to False, then the tokenizer will lowercase everything except for emoticons.

"""

18 changes: 9 additions & 9 deletions src/transformers/models/big_bird/modeling_big_bird.py
@@ -725,7 +725,7 @@ def bigbird_block_sparse_attention(
band_product, dim=-1
) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size]

# contibution of sliding keys
# contribution of sliding keys
# [bsz, n_heads, m//from_block_size-4, from_block_size, 3*to_block_size] x [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1]
context_layer = self.torch_bmm_nd(
attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
@@ -877,7 +877,7 @@ def bigbird_block_sparse_attention(
attn_probs_view[:, :, q_idx, :, q_idx : q_idx + 3, :] = right_slice.view(
bsz, n_heads, from_block_size, 3, to_block_size
) # inner_band_product
# global keys (correspomding to 1st key block)
# global keys (corresponding to 1st key block)
attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, :to_block_size] = attn_weights[
:, :, :, :, :to_block_size
].view(
@@ -947,7 +947,7 @@ def bigbird_block_sparse_attention(

@staticmethod
def torch_gather_b2(params, indices):
# this operation is equilvalent to tf.gather when batch_dims=2
# this operation is equivalent to tf.gather when batch_dims=2

if params.shape[:2] != indices.shape[:2]:
raise ValueError(
@@ -1055,7 +1055,7 @@ def _bigbird_block_rand_mask(
to_block_size: int. size of block in to sequence.
num_rand_blocks: int. Number of random chunks per row.
last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence,
if positive then num_rand_blocks blocks choosen only upto last_idx.
if positive then num_rand_blocks blocks chosen only up to last_idx.

Returns:
adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks
@@ -1150,7 +1150,7 @@ def _bigbird_block_rand_mask_with_head(
plan_block_length = np.array(plan_from_length) // from_block_size
# till when to follow plan
max_plan_idx = plan_from_length.index(from_seq_length)
# Random Attention adjajency list
# Random Attention adjacency list
rand_attn = [
np.zeros((num_blocks, np.sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=np.int32)
for i in range(num_heads)
@@ -1247,8 +1247,8 @@ def _get_single_block_row_attention(

Args:
block_id: int. block id of row.
to_start_block_id: int. random attention coloum start id.
to_end_block_id: int. random attention coloum end id.
to_start_block_id: int. random attention column start id.
to_end_block_id: int. random attention column end id.
num_rand_blocks: int. number of random blocks to be selected.
window_block_left: int. number of blocks of window to left of a block.
window_block_right: int. number of blocks of window to right of a block.
@@ -1826,7 +1826,7 @@ def _init_weights(self, module):
@dataclass
class BigBirdForPreTrainingOutput(ModelOutput):
"""
Output type of :class:`~transformers.BigBirdtForPreTraining`.
Output type of :class:`~transformers.BigBirdForPreTraining`.

Args:
loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
@@ -2907,7 +2907,7 @@ def forward(

logits_mask = None
if question_lengths is not None:
# setting lengths logits to `-infi`
# setting lengths logits to `-inf`
logits_mask = self.prepare_question_mask(question_lengths, seqlen)
if token_type_ids is None:
token_type_ids = (~logits_mask).long()
4 changes: 2 additions & 2 deletions src/transformers/models/blenderbot/modeling_blenderbot.py
@@ -237,9 +237,9 @@ def forward(
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)