From 421c60620d8aa218ac9ec0eb2b769d2a54046c45 Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Tue, 10 Nov 2020 10:18:34 -0500
Subject: [PATCH] [docs] improve bart/marian/mBART/pegasus docs (#8421)

---
 docs/source/model_doc/bart.rst    |  23 ++++++-
 docs/source/model_doc/marian.rst  | 103 ++++++++++++++++++++----------
 docs/source/model_doc/mbart.rst   |  13 ++--
 docs/source/model_doc/pegasus.rst |  17 +++--
 tests/test_modeling_bart.py       |   4 +-
 5 files changed, 114 insertions(+), 46 deletions(-)

diff --git a/docs/source/model_doc/bart.rst b/docs/source/model_doc/bart.rst
index 84a7b699bfab79..f75b7810c91959 100644
--- a/docs/source/model_doc/bart.rst
+++ b/docs/source/model_doc/bart.rst
@@ -34,6 +34,8 @@ ________________________________________________________________________________
 - An example of how to train :class:`~transformers.BartForConditionalGeneration` with a Hugging Face :obj:`datasets`
   object can be found in this `forum discussion `__.
+- `Distilled checkpoints `__ are described in this `paper
+  `__.


 Implementation Notes
@@ -44,14 +46,31 @@ Implementation Notes
 - The forward pass of :class:`~transformers.BartModel` will create decoder inputs (using the helper function
   :func:`transformers.modeling_bart._prepare_bart_decoder_inputs`) if they are not passed. This is different than some
   other modeling APIs.
-- Model predictions are intended to be identical to the original implementation. This only works, however, if the
-  string you pass to :func:`fairseq.encode` starts with a space.
+- Model predictions are intended to be identical to the original implementation when
+  :obj:`force_bos_token_to_be_generated=True`. This only works, however, if the string you pass to
+  :func:`fairseq.encode` starts with a space.
 - :meth:`~transformers.BartForConditionalGeneration.generate` should be used for conditional generation tasks like
   summarization, see the example in that docstring.
 - Models that load the `facebook/bart-large-cnn` weights will not have a :obj:`mask_token_id`, or be able to perform
   mask-filling tasks.
 - For training/forward passes that don't involve beam search, pass :obj:`use_cache=False`.

+Mask Filling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :obj:`facebook/bart-base` and :obj:`facebook/bart-large` checkpoints can be used to fill multi-token masks.
+
+.. code-block:: python
+
+    from transformers import BartForConditionalGeneration, BartTokenizer
+    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", force_bos_token_to_be_generated=True)
+    tok = BartTokenizer.from_pretrained("facebook/bart-large")
+    example_english_phrase = "UN Chief Says There Is No <mask> in Syria"
+    batch = tok(example_english_phrase, return_tensors='pt')
+    generated_ids = model.generate(batch['input_ids'])
+    assert tok.batch_decode(generated_ids, skip_special_tokens=True) == ['UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria']
+
+
 BartConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/docs/source/model_doc/marian.rst b/docs/source/model_doc/marian.rst
index 31fd028d441003..e90123adc4be94 100644
--- a/docs/source/model_doc/marian.rst
+++ b/docs/source/model_doc/marian.rst
@@ -5,7 +5,7 @@ MarianMT
 `__ and assign
 @patrickvonplaten.

-Translations should be similar, but not identical to, output in the test set linked to in each model card.
+Translations should be similar to, but not identical to, the output in the test set linked to in each model card.

 Implementation Notes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -35,32 +35,46 @@ Naming
 `__, three digit codes require googling "language code {code}".
 - Codes formatted like :obj:`es_AR` are usually :obj:`code_{region}`. That one is Spanish from Argentina.
+- The models were converted in two stages. The first 1000 models use ISO-639-2 codes to identify languages, the second
+  group uses a combination of ISO-639-5 codes and ISO-639-2 codes.

-Multilingual Models
+Examples
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-All model names use the following format: :obj:`Helsinki-NLP/opus-mt-{src}-{tgt}`:
+- Since Marian models are smaller than many other translation models available in the library, they can be useful for
+  fine-tuning experiments and integration tests.
+- `Fine-tune on TPU
+  `__
+- `Fine-tune on GPU
+  `__
+- `Fine-tune on GPU with pytorch-lightning
+  `__
+
+Multilingual Models
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-  - If :obj:`src` is in all caps, the model supports multiple input languages, you can figure out which ones by
-    looking at the model card, or the Group Members `mapping
-    `_ .
-  - If :obj:`tgt` is in all caps, the model can output multiple languages, and you should specify a language code by
-    prepending the desired output language to the :obj:`src_text`.
-  - You can see a tokenizer's supported language codes in ``tokenizer.supported_language_codes``
+- All model names use the following format: :obj:`Helsinki-NLP/opus-mt-{src}-{tgt}`:
+- If a model can output multiple languages, you should specify a language code by prepending the desired output
+  language to the :obj:`src_text`.
+- You can see a model's supported language codes in its model card, under target constituents, like in `opus-mt-en-roa
+  `__.
+- Note that if a model is only multilingual on the source side, like :obj:`Helsinki-NLP/opus-mt-roa-en`, no language
+  codes are required.

-Example of translating english to many romance languages, using language codes:
+New multi-lingual models from the `Tatoeba-Challenge repo `__
+require 3-character language codes:

 .. code-block:: python

     from transformers import MarianMTModel, MarianTokenizer
     src_text = [
-        '>>fr<< this is a sentence in english that we want to translate to french',
-        '>>pt<< This should go to portuguese',
-        '>>es<< And this to Spanish'
+        '>>fra<< this is a sentence in english that we want to translate to french',
+        '>>por<< This should go to portuguese',
+        '>>esp<< And this to Spanish'
     ]

-    model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
+    model_name = 'Helsinki-NLP/opus-mt-en-roa'
     tokenizer = MarianTokenizer.from_pretrained(model_name)
     print(tokenizer.supported_language_codes)
     model = MarianMTModel.from_pretrained(model_name)
@@ -70,25 +84,42 @@ Example of translating english to many romance languages, using language codes:
     # 'Isto deve ir para o português.',
     # 'Y esto al español']

-Sometimes, models were trained on collections of languages that do not resolve to a group. In this case, _ is used as a
-separator for src or tgt, as in :obj:`Helsinki-NLP/opus-mt-en_el_es_fi-en_el_es_fi`. These still require language
-codes.
-There are many supported regional language codes, like :obj:`>>es_ES<<` (Spain) and :obj:`>>es_AR<<` (Argentina), that
-do not seem to change translations. I have not found these to provide different results than just using :obj:`>>es<<`.
-For example:
-    - `Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU`: translates from all NORTH_EU languages (see `mapping
-      `_) to all NORTH_EU languages. Use a special
-      language code like :obj:`>>de<<` to specify output language.
-    - `Helsinki-NLP/opus-mt-ROMANCE-en`: translates from many romance languages to english, no codes needed since there
-      is only one target language.
+Code to see available pretrained models:
+
+.. code-block:: python
+
+    from transformers.hf_api import HfApi
+    model_list = HfApi().model_list()
+    org = "Helsinki-NLP"
+    model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
+    suffix = [x.split('/')[1] for x in model_ids]
+    old_style_multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
+
+
+Old Style Multi-Lingual Models
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These are the old style multi-lingual models ported from the OPUS-MT-Train repo, along with the members of each
+language group:

 .. code-block:: python

+    ['Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU',
+     'Helsinki-NLP/opus-mt-ROMANCE-en',
+     'Helsinki-NLP/opus-mt-SCANDINAVIA-SCANDINAVIA',
+     'Helsinki-NLP/opus-mt-de-ZH',
+     'Helsinki-NLP/opus-mt-en-CELTIC',
+     'Helsinki-NLP/opus-mt-en-ROMANCE',
+     'Helsinki-NLP/opus-mt-es-NORWAY',
+     'Helsinki-NLP/opus-mt-fi-NORWAY',
+     'Helsinki-NLP/opus-mt-fi-ZH',
+     'Helsinki-NLP/opus-mt-fi_nb_no_nn_ru_sv_en-SAMI',
+     'Helsinki-NLP/opus-mt-sv-NORWAY',
+     'Helsinki-NLP/opus-mt-sv-ZH']
     GROUP_MEMBERS = {
 'ZH': ['cmn', 'cn', 'yue', 'ze_zh', 'zh_cn', 'zh_CN', 'zh_HK', 'zh_tw', 'zh_TW', 'zh_yue', 'zhs', 'zht', 'zh'],
 'ROMANCE': ['fr', 'fr_BE', 'fr_CA', 'fr_FR', 'wa', 'frp', 'oc', 'ca', 'rm', 'lld', 'fur', 'lij', 'lmo', 'es', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'es_PA', 'es_PE', 'es_PR', 'es_SV', 'es_UY', 'es_VE', 'pt', 'pt_br', 'pt_BR', 'pt_PT', 'gl', 'lad', 'an', 'mwl', 'it', 'it_IT', 'co', 'nap', 'scn', 'vec', 'sc', 'ro', 'la'],
@@ -99,16 +130,22 @@ For example:
 'CELTIC': ['ga', 'cy', 'br', 'gd', 'kw', 'gv']
 }

-Code to see available pretrained models:
-.. code-block:: python
-    from transformers.hf_api import HfApi
-    model_list = HfApi().model_list()
-    org = "Helsinki-NLP"
-    model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
-    suffix = [x.split('/')[1] for x in model_ids]
-    multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
+
+
+Example of translating English to many Romance languages, using old-style 2-character language codes:
+
+.. code-block:: python
+
+    from transformers import MarianMTModel, MarianTokenizer
+
+    src_text = [
+        '>>fr<< this is a sentence in english that we want to translate to french',
+        '>>pt<< This should go to portuguese',
+        '>>es<< And this to Spanish',
+    ]
+
+    model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
+    tokenizer = MarianTokenizer.from_pretrained(model_name)
+    print(tokenizer.supported_language_codes)
+    model = MarianMTModel.from_pretrained(model_name)
+    translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text))
+    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
+    # ["c'est une phrase en anglais que nous voulons traduire en français",
+    #  'Isto deve ir para o português.',
+    #  'Y esto al español']
+


 MarianConfig

diff --git a/docs/source/model_doc/mbart.rst b/docs/source/model_doc/mbart.rst
index 99725eb36a0e91..9d3aab93fd3590 100644
--- a/docs/source/model_doc/mbart.rst
+++ b/docs/source/model_doc/mbart.rst
@@ -19,6 +19,13 @@ on the encoder, decoder, or reconstructing parts of the text.

 The Authors' code can be found `here `__

+Examples
+_______________________________________________________________________________________________________________________
+
+- Examples and scripts for fine-tuning mBART and other models for sequence-to-sequence tasks can be found in
+  `examples/seq2seq/ `__.
+- Given the large embeddings table, mBART consumes a large amount of GPU RAM, especially for fine-tuning.
+  :class:`MarianMTModel` is usually a better choice for bilingual machine translation.

 Training
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -38,11 +45,7 @@ the sequences for sequence-to-sequence fine-tuning.
     example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
     expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
     batch = tokenizer.prepare_seq2seq_batch(example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian)
-    input_ids = batch["input_ids"]
-    target_ids = batch["decoder_input_ids"]
-    decoder_input_ids = target_ids[:, :-1].contiguous()
-    labels = target_ids[:, 1:].clone()
-    model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, labels=labels) #forward
+    model(input_ids=batch['input_ids'], labels=batch['labels']) # forward pass

 - Generation

diff --git a/docs/source/model_doc/pegasus.rst b/docs/source/model_doc/pegasus.rst
index 1a6466474540c9..a652ce0851ef4f 100644
--- a/docs/source/model_doc/pegasus.rst
+++ b/docs/source/model_doc/pegasus.rst
@@ -31,10 +31,19 @@ All the `checkpoints `__ are fine-
 - Each checkpoint is 2.2 GB on disk and 568M parameters.
 - FP16 is not supported (help/ideas on this appreciated!).
 - Summarizing xsum in fp32 takes about 400ms/sample, with default parameters on a v100 GPU.
-- For XSUM, The paper reports rouge1,rouge2, rougeL of paper: 47.21/24.56/39.25. As of Aug 9, this port scores
-  46.91/24.34/39.1.
+- Full replication results and correctly pre-processed data can be found in this `Issue
+  `__.
+- `Distilled checkpoints `__ are described in this `paper
+  `__.

-The gap is likely because of different alpha/length_penalty implementations in beam search.
+Examples
+_______________________________________________________________________________________________________________________
+
+- `Script `__ to
+  fine-tune pegasus on the XSUM dataset. Data download instructions at `examples/seq2seq/
+  `__.
+- FP16 is not supported (help/ideas on this appreciated!).
+- The Adafactor optimizer is recommended for pegasus fine-tuning.


 Implementation Notes
@@ -45,7 +54,7 @@ Implementation Notes
 - Some key configuration differences:

     - static, sinusoidal position embeddings
-    - no :obj:`layernorm_embedding` (:obj`PegasusConfig.normalize_embedding=False`)
+    - no :obj:`layernorm_embedding` (:obj:`PegasusConfig.normalize_embedding=False`)
     - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix.
     - more beams are used (:obj:`num_beams=8`)
 - All pretrained pegasus checkpoints are the same besides three attributes: :obj:`tokenizer.model_max_length` (maximum
diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index c6f9f65dca9678..55232389366977 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -476,9 +476,9 @@ def test_bart_base_mask_filling(self):

     @slow
     def test_bart_large_mask_filling(self):
-        pbase = pipeline(task="fill-mask", model="facebook/bart-large")
+        plarge = pipeline(task="fill-mask", model="facebook/bart-large")
         src_text = [" I went to the <mask>."]
-        results = [x["token_str"] for x in pbase(src_text)]
+        results = [x["token_str"] for x in plarge(src_text)]
         expected_results = ["Ġbathroom", "Ġgym", "Ġwrong", "Ġmovies", "Ġhospital"]
         self.assertListEqual(results, expected_results)
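
For reference, a minimal sketch of the same fill-mask usage outside the test harness. The model name, mask token, and
candidate tokens are taken from the test above; the exact top-5 ranking may vary across model revisions, so treat the
printed output as illustrative rather than guaranteed:

.. code-block:: python

    from transformers import pipeline

    # Build the same fill-mask pipeline that test_bart_large_mask_filling exercises.
    fill_mask = pipeline(task="fill-mask", model="facebook/bart-large")

    # BART uses "<mask>" as its mask token; the leading space matters for the BPE tokenizer.
    predictions = fill_mask(" I went to the <mask>.")

    # Each prediction is a dict with "sequence", "score", "token", and "token_str".
    print([p["token_str"] for p in predictions])
    # e.g. ['Ġbathroom', 'Ġgym', 'Ġwrong', 'Ġmovies', 'Ġhospital']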