From aa3c05696eca4c316f3bd0040a2fc5cf10e8dd86 Mon Sep 17 00:00:00 2001
From: sgugger
Date: Mon, 26 Oct 2020 20:16:53 -0400
Subject: [PATCH 1/3] Fix a few docstrings

---
 src/transformers/tokenization_camembert_fast.py | 10 ++++++----
 src/transformers/tokenization_dpr.py            | 12 ++++++------
 src/transformers/tokenization_dpr_fast.py       | 12 ++++++------
 src/transformers/trainer.py                     | 17 ++++++++++-------
 utils/style_doc.py                              |  3 ++-
 5 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/src/transformers/tokenization_camembert_fast.py b/src/transformers/tokenization_camembert_fast.py
index 912e975fe95049..b07e084ab6d86c 100644
--- a/src/transformers/tokenization_camembert_fast.py
+++ b/src/transformers/tokenization_camembert_fast.py
@@ -66,10 +66,12 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
-    vocab_file (:obj:`str`): `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm`
-    extension) that contains the vocabulary necessary to instantiate a tokenizer. bos_token (:obj:`str`, `optional`,
-    defaults to :obj:`"<s>"`): The beginning of sequence token that was used during pretraining. Can be used a sequence
-    classifier token.
+    Args:
+        vocab_file (:obj:`str`):
+            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
             .. note::
 
diff --git a/src/transformers/tokenization_dpr.py b/src/transformers/tokenization_dpr.py
index 92e6b9be9718f9..4aca836e63b32a 100644
--- a/src/transformers/tokenization_dpr.py
+++ b/src/transformers/tokenization_dpr.py
@@ -129,10 +129,10 @@ class DPRQuestionEncoderTokenizer(BertTokenizer):
 
 
 CUSTOM_DPR_READER_DOCSTRING = r"""
-        Return a dictionary with the token ids of the input strings and other information to give to
-        :obj:`.decode_best_spans`. It converts the strings of a question and different passages (title and text) in a
-        sequence of IDs (integers), using the tokenizer and vocabulary. The resulting :obj:`input_ids` is a matrix of
-        size :obj:`(n_passages, sequence_length)` with the format:
+    Return a dictionary with the token ids of the input strings and other information to give to
+    :obj:`.decode_best_spans`. It converts the strings of a question and different passages (title and text) in a
+    sequence of IDs (integers), using the tokenizer and vocabulary. The resulting :obj:`input_ids` is a matrix of size
+    :obj:`(n_passages, sequence_length)` with the format:
 
     ::
 
@@ -189,12 +189,12 @@ class DPRQuestionEncoderTokenizer(BertTokenizer):
 
             `What are attention masks? <../glossary.html#attention-mask>`__
 
-    Return:
+    Returns:
         :obj:`Dict[str, List[List[int]]]`: A dictionary with the following keys:
 
         - ``input_ids``: List of token ids to be fed to a model.
        - ``attention_mask``: List of indices specifying which tokens should be attended to by the model.
-        """
+    """
 
 
 @add_start_docstrings(CUSTOM_DPR_READER_DOCSTRING)
diff --git a/src/transformers/tokenization_dpr_fast.py b/src/transformers/tokenization_dpr_fast.py
index f81ae54211de09..29e98119abcdfa 100644
--- a/src/transformers/tokenization_dpr_fast.py
+++ b/src/transformers/tokenization_dpr_fast.py
@@ -132,12 +132,12 @@ class DPRQuestionEncoderTokenizerFast(BertTokenizerFast):
 
 
 CUSTOM_DPR_READER_DOCSTRING = r"""
-        Return a dictionary with the token ids of the input strings and other information to give to
-        :obj:`.decode_best_spans`. It converts the strings of a question and different passages (title and text) in a
-        sequence of IDs (integers), using the tokenizer and vocabulary. The resulting :obj:`input_ids` is a matrix of
-        size :obj:`(n_passages, sequence_length)` with the format:
+    Return a dictionary with the token ids of the input strings and other information to give to
+    :obj:`.decode_best_spans`. It converts the strings of a question and different passages (title and text) in a
+    sequence of IDs (integers), using the tokenizer and vocabulary. The resulting :obj:`input_ids` is a matrix of size
+    :obj:`(n_passages, sequence_length)` with the format:
 
-        [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>
+    [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>
 
     Args:
         questions (:obj:`str` or :obj:`List[str]`):
@@ -195,7 +195,7 @@ class DPRQuestionEncoderTokenizerFast(BertTokenizerFast):
 
         - ``input_ids``: List of token ids to be fed to a model.
        - ``attention_mask``: List of indices specifying which tokens should be attended to by the model.
-        """
+    """
 
 
 @add_start_docstrings(CUSTOM_DPR_READER_DOCSTRING)
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index f032d4e0756324..6e37618479427f 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -191,18 +191,21 @@ class Trainer:
 
             The function may have zero argument, or a single one containing the optuna/Ray Tune trial object, to be
             able to choose different architectures according to hyper parameters (such as layer count, sizes of inner
-            layers, dropout probabilities etc). compute_metrics (:obj:`Callable[[EvalPrediction], Dict]`, `optional`):
+            layers, dropout probabilities etc).
+        compute_metrics (:obj:`Callable[[EvalPrediction], Dict]`, `optional`):
             The function that will be used to compute metrics at evaluation. Must take a
-            :class:`~transformers.EvalPrediction` and return a dictionary string to metric values. callbacks (List of
-            :obj:`~transformers.TrainerCallback`, `optional`): A list of callbacks to customize the training loop. Will
-            add those to the list of default callbacks detailed in :doc:`here `.
+            :class:`~transformers.EvalPrediction` and return a dictionary string to metric values.
+        callbacks (List of :obj:`~transformers.TrainerCallback`, `optional`):
+            A list of callbacks to customize the training loop. Will add those to the list of default callbacks
+            detailed in :doc:`here `.
 
             If you want to remove one of the default callbacks used, use the :meth:`Trainer.remove_callback` method.
-            optimizers (:obj:`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR`, `optional`): A tuple
+        optimizers (:obj:`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR`, `optional`): A tuple
             containing the optimizer and the scheduler to use. Will default to an instance of
             :class:`~transformers.AdamW` on your model and a scheduler given by
-            :func:`~transformers.get_linear_schedule_with_warmup` controlled by :obj:`args`. kwargs: Deprecated keyword
-            arguments.
+            :func:`~transformers.get_linear_schedule_with_warmup` controlled by :obj:`args`.
+        kwargs:
+            Deprecated keyword arguments.
     """
 
     def __init__(
diff --git a/utils/style_doc.py b/utils/style_doc.py
index 2c67b784d73dbd..7f8a2bac868ad4 100644
--- a/utils/style_doc.py
+++ b/utils/style_doc.py
@@ -312,10 +312,11 @@ class DocstringStyler(CodeStyler):
     """Class to style docstrings that take the main method from `CodeStyler`."""
 
     def is_no_style_block(self, line):
+        if _re_textual_blocks.search(line) is not None:
+            return False
         if _re_example.search(line) is not None:
             return True
         return _re_code_block.search(line) is not None
-        # return super().is_no_style_block(line) is not None
 
     def is_comment_or_textual_block(self, line):
         if _re_return.search(line) is not None:

From 0a48f2e7e9c55b722f72fffc0ea6b8c9919ea1fa Mon Sep 17 00:00:00 2001
From: sgugger
Date: Mon, 26 Oct 2020 20:18:44 -0400
Subject: [PATCH 2/3] More fixes

---
 .../tokenization_camembert_fast.py | 19 +++++++++----------
 src/transformers/training_args.py  | 18 ++++++++++--------
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/src/transformers/tokenization_camembert_fast.py b/src/transformers/tokenization_camembert_fast.py
index b07e084ab6d86c..f111c76d141736 100644
--- a/src/transformers/tokenization_camembert_fast.py
+++ b/src/transformers/tokenization_camembert_fast.py
@@ -75,23 +75,22 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
 
             .. note::
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning
-                of sequence. The token used is the :obj:`cls_token`.
+                When building a sequence using special tokens, this is not the token that is used for the beginning of
+                sequence. The token used is the :obj:`cls_token`.
         eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
             The end of sequence token.
 
             .. note::
 
-                When building a sequence using special tokens, this is not the token that is used for the end
-                of sequence. The token used is the :obj:`sep_token`.
+                When building a sequence using special tokens, this is not the token that is used for the end of
+                sequence. The token used is the :obj:`sep_token`.
         sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
-            for sequence classification or for a text and a question for question answering.
-            It is also used as the last token of a sequence built with special tokens.
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
         cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
-            The classifier token which is used when doing sequence classification (classification of the whole
-            sequence instead of per-token classification). It is the first token of the sequence when built with
-            special tokens.
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
         unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 0d56d73f8bb909..fe33980ec047cf 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -144,28 +144,30 @@ class TrainingArguments:
             If using `nlp.Dataset` datasets, whether or not to automatically remove the columns unused by the model
             forward method.
 
-            (Note that this behavior is not implemented for :class:`~transformers.TFTrainer` yet.) label_names
-            (:obj:`List[str]`, `optional`): The list of keys in your dictionary of inputs that correspond to the
-            labels.
+            (Note that this behavior is not implemented for :class:`~transformers.TFTrainer` yet.)
+        label_names (:obj:`List[str]`, `optional`):
+            The list of keys in your dictionary of inputs that correspond to the labels.
 
             Will eventually default to :obj:`["labels"]` except if the model used is one of the
             :obj:`XxxForQuestionAnswering` in which case it will default to :obj:`["start_positions",
-            "end_positions"]`. load_best_model_at_end (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether or
-            not to load the best model found during training at the end of training.
+            "end_positions"]`.
+        load_best_model_at_end (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not to load the best model found during training at the end of training.
 
             .. note::
 
                 When set to :obj:`True`, the parameters :obj:`save_steps` will be ignored and the model will be saved
                 after each evaluation.
-        metric_for_best_model (:obj:`str`, `optional`)
+        metric_for_best_model (:obj:`str`, `optional`):
             Use in conjunction with :obj:`load_best_model_at_end` to specify the metric to use to compare two different
             models. Must be the name of a metric returned by the evaluation with or without the prefix :obj:`"eval_"`.
             Will default to :obj:`"loss"` if unspecified and :obj:`load_best_model_at_end=True` (to use the evaluation
             loss).
 
             If you set this value, :obj:`greater_is_better` will default to :obj:`True`. Don't forget to set it to
-            :obj:`False` if your metric is better when lower. greater_is_better (:obj:`bool`, `optional`) Use in
-            conjunction with :obj:`load_best_model_at_end` and :obj:`metric_for_best_model` to specify if better models
+            :obj:`False` if your metric is better when lower.
+        greater_is_better (:obj:`bool`, `optional`):
+            Use in conjunction with :obj:`load_best_model_at_end` and :obj:`metric_for_best_model` to specify if better models
             should have a greater metric or not. Will default to:
 
             - :obj:`True` if :obj:`metric_for_best_model` is set to a value that isn't :obj:`"loss"` or

From 7953654b307721c88abbaaa2ddb54686aa5a6f13 Mon Sep 17 00:00:00 2001
From: sgugger
Date: Mon, 26 Oct 2020 20:19:02 -0400
Subject: [PATCH 3/3] Styling

---
 src/transformers/training_args.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index fe33980ec047cf..fe88d2aa4bcd35 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -145,7 +145,7 @@ class TrainingArguments:
             forward method.
 
             (Note that this behavior is not implemented for :class:`~transformers.TFTrainer` yet.)
-        label_names (:obj:`List[str]`, `optional`): 
+        label_names (:obj:`List[str]`, `optional`):
             The list of keys in your dictionary of inputs that correspond to the labels.
 
             Will eventually default to :obj:`["labels"]` except if the model used is one of the
@@ -167,8 +167,8 @@ class TrainingArguments:
             If you set this value, :obj:`greater_is_better` will default to :obj:`True`. Don't forget to set it to
             :obj:`False` if your metric is better when lower.
         greater_is_better (:obj:`bool`, `optional`):
-            Use in conjunction with :obj:`load_best_model_at_end` and :obj:`metric_for_best_model` to specify if better models
-            should have a greater metric or not. Will default to:
+            Use in conjunction with :obj:`load_best_model_at_end` and :obj:`metric_for_best_model` to specify if better
+            models should have a greater metric or not. Will default to:
 
             - :obj:`True` if :obj:`metric_for_best_model` is set to a value that isn't :obj:`"loss"` or
              :obj:`"eval_loss"`.