From 5ffde9b4bcfe0745134dd1a981ca7b1fd9eb841e Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Sun, 20 Sep 2020 15:30:12 -0700
Subject: [PATCH 1/4] make it easier to run the scripts

---
 examples/seq2seq/README.md                      | 14 +++++++-------
 examples/seq2seq/convert_model_to_fp16.py       |  2 ++
 examples/seq2seq/convert_pl_checkpoint_to_hf.py |  2 ++
 examples/seq2seq/distillation.py                |  2 ++
 examples/seq2seq/download_wmt.py                |  2 ++
 examples/seq2seq/finetune.py                    |  9 ++++++++-
 examples/seq2seq/finetune.sh                    |  3 ---
 examples/seq2seq/minify_dataset.py              |  2 ++
 examples/seq2seq/pack_dataset.py                |  2 ++
 examples/seq2seq/run_distributed_eval.py        |  2 ++
 examples/seq2seq/run_eval.py                    |  2 ++
 examples/seq2seq/run_eval_search.py             |  2 ++
 examples/seq2seq/save_len_file.py               |  9 +++------
 examples/seq2seq/test_bash_script.py            |  2 ++
 examples/seq2seq/test_data/wmt_en_ro/train.len  | Bin 26 -> 34 bytes
 examples/seq2seq/test_data/wmt_en_ro/val.len    | Bin 40 -> 48 bytes
 examples/seq2seq/test_datasets.py               |  9 ++++-----
 examples/seq2seq/test_fsmt_bleu_score.py        | 10 ++--------
 18 files changed, 44 insertions(+), 30 deletions(-)
 mode change 100644 => 100755 examples/seq2seq/convert_model_to_fp16.py
 mode change 100644 => 100755 examples/seq2seq/convert_pl_checkpoint_to_hf.py
 mode change 100644 => 100755 examples/seq2seq/distillation.py
 mode change 100644 => 100755 examples/seq2seq/download_wmt.py
 mode change 100644 => 100755 examples/seq2seq/finetune.py
 mode change 100644 => 100755 examples/seq2seq/minify_dataset.py
 mode change 100644 => 100755 examples/seq2seq/pack_dataset.py
 mode change 100644 => 100755 examples/seq2seq/run_distributed_eval.py
 mode change 100644 => 100755 examples/seq2seq/run_eval.py
 mode change 100644 => 100755 examples/seq2seq/run_eval_search.py
 mode change 100644 => 100755 examples/seq2seq/save_len_file.py

diff --git a/examples/seq2seq/README.md b/examples/seq2seq/README.md
index 2eb9c6e4121f32..bd0fd1a9d4c255 100644
--- a/examples/seq2seq/README.md
+++ b/examples/seq2seq/README.md
@@ -92,7 +92,7 @@ All finetuning bash scripts call finetune.py (or distillation.py) with reasonabl
 To see all the possible command line options, run:
 
 ```bash
-./finetune.sh --help # this calls python finetune.py --help
+ --help # this calls ./finetune.py --help
 ```
 
 ### Finetuning Training Params
@@ -189,7 +189,7 @@ If 'translation' is in your task name, the computed metric will be BLEU. Otherwi
 For t5, you need to specify --task translation_{src}_to_{tgt} as follows:
 ```bash
 export DATA_DIR=wmt_en_ro
-python run_eval.py t5-base \
+./run_eval.py t5-base \
 $DATA_DIR/val.source t5_val_generations.txt \
 --reference_path $DATA_DIR/val.target \
 --score_path enro_bleu.json \
@@ -203,7 +203,7 @@ python run_eval.py t5-base \
 This command works for MBART, although the BLEU score is suspiciously low.
 ```bash
 export DATA_DIR=wmt_en_ro
-python run_eval.py facebook/mbart-large-en-ro $DATA_DIR/val.source mbart_val_generations.txt \
+./run_eval.py facebook/mbart-large-en-ro $DATA_DIR/val.source mbart_val_generations.txt \
 --reference_path $DATA_DIR/val.target \
 --score_path enro_bleu.json \
 --task translation \
@@ -216,7 +216,7 @@ python run_eval.py facebook/mbart-large-en-ro $DATA_DIR/val.source mbart_val_gen
 Summarization (xsum will be very similar):
 ```bash
 export DATA_DIR=cnn_dm
-python run_eval.py sshleifer/distilbart-cnn-12-6 $DATA_DIR/val.source dbart_val_generations.txt \
+./run_eval.py sshleifer/distilbart-cnn-12-6 $DATA_DIR/val.source dbart_val_generations.txt \
 --reference_path $DATA_DIR/val.target \
 --score_path cnn_rouge.json \
 --task summarization \
@@ -230,7 +230,7 @@ python run_eval.py sshleifer/distilbart-cnn-12-6 $DATA_DIR/val.source dbart_val_
 ### Multi-GPU Evalulation
 
 here is a command to run xsum evaluation on 8 GPUS. It is more than linearly faster than run_eval.py in some cases
 because it uses SortishSampler to minimize padding. You can also use it on 1 GPU. `data_dir` must have
-`{type_path}.source` and `{type_path}.target`. Run `python run_distributed_eval.py --help` for all clargs.
+`{type_path}.source` and `{type_path}.target`. Run `./run_distributed_eval.py --help` for all clargs.
 ```bash
 python -m torch.distributed.launch --nproc_per_node=8 run_distributed_eval.py \
@@ -363,11 +363,11 @@ This feature can only be used:
 - with fairseq installed
 - on 1 GPU
 - without sortish sampler
-- after calling `python save_len_file.py $tok $data_dir`
+- after calling `./save_len_file.py $tok $data_dir`
 
 For example,
 ```bash
-python save_len_file.py Helsinki-NLP/opus-mt-en-ro wmt_en_ro
+./save_len_file.py Helsinki-NLP/opus-mt-en-ro wmt_en_ro
 ./dynamic_bs_example.sh --max_tokens_per_batch=2000 --output_dir benchmark_dynamic_bs
 ```
 splits `wmt_en_ro/train` into 11,197 uneven lengthed batches and can finish 1 epoch in 8 minutes on a v100.
diff --git a/examples/seq2seq/convert_model_to_fp16.py b/examples/seq2seq/convert_model_to_fp16.py
old mode 100644
new mode 100755
index 24042cc0e7e581..26b1ff8fd8f664
--- a/examples/seq2seq/convert_model_to_fp16.py
+++ b/examples/seq2seq/convert_model_to_fp16.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 from typing import Union
 
 import fire
diff --git a/examples/seq2seq/convert_pl_checkpoint_to_hf.py b/examples/seq2seq/convert_pl_checkpoint_to_hf.py
old mode 100644
new mode 100755
index ccae1672919315..5f3c984f3724c1
--- a/examples/seq2seq/convert_pl_checkpoint_to_hf.py
+++ b/examples/seq2seq/convert_pl_checkpoint_to_hf.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 import os
 from pathlib import Path
 from typing import Dict, List
diff --git a/examples/seq2seq/distillation.py b/examples/seq2seq/distillation.py
old mode 100644
new mode 100755
index 3b1ce10d0d5fbc..1ffd02e9a18450
--- a/examples/seq2seq/distillation.py
+++ b/examples/seq2seq/distillation.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 import argparse
 import gc
 import os
diff --git a/examples/seq2seq/download_wmt.py b/examples/seq2seq/download_wmt.py
old mode 100644
new mode 100755
index efe5ff0b9129f5..bef04726c45ede
--- a/examples/seq2seq/download_wmt.py
+++ b/examples/seq2seq/download_wmt.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 from pathlib import Path
 
 import fire
diff --git a/examples/seq2seq/finetune.py b/examples/seq2seq/finetune.py
old mode 100644
new mode 100755
index f54f15c1d55be3..1f5a7b6e10593c
--- a/examples/seq2seq/finetune.py
+++ b/examples/seq2seq/finetune.py
@@ -1,7 +1,10 @@
+#!/usr/bin/env python
+
 import argparse
 import glob
 import logging
 import os
+import sys
 import time
 from collections import defaultdict
 from pathlib import Path
@@ -13,7 +16,6 @@
 from torch.utils.data import DataLoader
 
 from callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback
-from lightning_base import BaseTransformer, add_generic_args, generic_train
 from transformers import MBartTokenizer, T5ForConditionalGeneration
 from transformers.modeling_bart import shift_tokens_right
 from utils import (
@@ -35,6 +37,11 @@
 )
 
 
+# need the parent dir scripts
+sys.path.insert(2, str(Path(__file__).resolve().parents[1]))
+from lightning_base import BaseTransformer, add_generic_args, generic_train # noqa
+
+
 logger = logging.getLogger(__name__)
 
 
diff --git a/examples/seq2seq/finetune.sh b/examples/seq2seq/finetune.sh
index 4d140db48e0a52..683c2d7752df13 100755
--- a/examples/seq2seq/finetune.sh
+++ b/examples/seq2seq/finetune.sh
@@ -1,6 +1,3 @@
-# Add parent directory to python path to access lightning_base.py
-export PYTHONPATH="../":"${PYTHONPATH}"
-
 # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path
 # run ./finetune.sh --help to see all the possible options
 python finetune.py \
diff --git a/examples/seq2seq/minify_dataset.py b/examples/seq2seq/minify_dataset.py
old mode 100644
new mode 100755
index da70ced60ab481..c441db565c7f68
--- a/examples/seq2seq/minify_dataset.py
+++ b/examples/seq2seq/minify_dataset.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 from pathlib import Path
 
 import fire
diff --git a/examples/seq2seq/pack_dataset.py b/examples/seq2seq/pack_dataset.py
old mode 100644
new mode 100755
index 4274054a920212..11351b75a7b3d0
--- a/examples/seq2seq/pack_dataset.py
+++ b/examples/seq2seq/pack_dataset.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 """Fill examples with bitext up to max_tokens without breaking up examples.
 [['I went', 'yo fui'],
 ['to the store', 'a la tienda']
diff --git a/examples/seq2seq/run_distributed_eval.py b/examples/seq2seq/run_distributed_eval.py
old mode 100644
new mode 100755
index e8218e19171be2..985c02929bbebf
--- a/examples/seq2seq/run_distributed_eval.py
+++ b/examples/seq2seq/run_distributed_eval.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 import argparse
 import shutil
 import time
diff --git a/examples/seq2seq/run_eval.py b/examples/seq2seq/run_eval.py
old mode 100644
new mode 100755
index 09ff4c9a53f5cc..0269ea24385c30
--- a/examples/seq2seq/run_eval.py
+++ b/examples/seq2seq/run_eval.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 import argparse
 import datetime
 import json
diff --git a/examples/seq2seq/run_eval_search.py b/examples/seq2seq/run_eval_search.py
old mode 100644
new mode 100755
index 2a819e169f9d34..292918c9f3d93e
--- a/examples/seq2seq/run_eval_search.py
+++ b/examples/seq2seq/run_eval_search.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 import argparse
 import itertools
 import operator
diff --git a/examples/seq2seq/save_len_file.py b/examples/seq2seq/save_len_file.py
old mode 100644
new mode 100755
index edd2cf07a670b0..15413cab165428
--- a/examples/seq2seq/save_len_file.py
+++ b/examples/seq2seq/save_len_file.py
@@ -1,14 +1,11 @@
+#!/usr/bin/env python
+
 import fire
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 
 from transformers import AutoTokenizer
-
-
-try:
-    from .utils import Seq2SeqDataset, pickle_save
-except ImportError:
-    from utils import Seq2SeqDataset, pickle_save
+from utils import Seq2SeqDataset, pickle_save
 
 
 def save_len_file(
diff --git a/examples/seq2seq/test_bash_script.py b/examples/seq2seq/test_bash_script.py
index 7d163d1c35dc84..45c928ad33c8c4 100644
--- a/examples/seq2seq/test_bash_script.py
+++ b/examples/seq2seq/test_bash_script.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 import argparse
 import os
 import sys
diff --git a/examples/seq2seq/test_data/wmt_en_ro/train.len b/examples/seq2seq/test_data/wmt_en_ro/train.len
index 33ce003c8ae3139914a389a714812a2ab13aece4..2632a33e8b8a3a601bdf93f3b8e6783a0c7bba60 100644
GIT binary patch
literal 34
lcmZo*nJUfz0kKmwye+)ry(fEDdzX5%dDnR7dRM3F0RV-J2?PKD

literal 26
hcmZo*jxA)+@V4-d_nz!s?Op24=3V2R>s_6y2LNfV2s8iy

diff --git a/examples/seq2seq/test_data/wmt_en_ro/val.len b/examples/seq2seq/test_data/wmt_en_ro/val.len
index 897314a960b28d927b597805693e63f9de71d903..fdf8fa353eb8d41a6d92ccae9df4b2f8aaf970b5 100644
GIT binary patch
delta 19
WcmdNe;B8=;s>%QXu~R0pDgyu?i~~*p

delta 11
ScmXreU~OQIEo7L;s|)}RKmu?8

diff --git a/examples/seq2seq/test_datasets.py b/examples/seq2seq/test_datasets.py
index 671543133e886f..aaf94fa5e0f732 100644
--- a/examples/seq2seq/test_datasets.py
+++ b/examples/seq2seq/test_datasets.py
@@ -6,14 +6,13 @@
 import pytest
 from torch.utils.data import DataLoader
 
+from pack_dataset import pack_data_dir
+from save_len_file import save_len_file
+from test_seq2seq_examples import ARTICLES, BART_TINY, MARIAN_TINY, MBART_TINY, SUMMARIES, T5_TINY, make_test_data_dir
 from transformers import AutoTokenizer
 from transformers.modeling_bart import shift_tokens_right
 from transformers.testing_utils import slow
-
-from .pack_dataset import pack_data_dir
-from .save_len_file import save_len_file
-from .test_seq2seq_examples import ARTICLES, BART_TINY, MARIAN_TINY, MBART_TINY, SUMMARIES, T5_TINY, make_test_data_dir
-from .utils import FAIRSEQ_AVAILABLE, DistributedSortishSampler, LegacySeq2SeqDataset, Seq2SeqDataset
+from utils import FAIRSEQ_AVAILABLE, DistributedSortishSampler, LegacySeq2SeqDataset, Seq2SeqDataset
 
 
 BERT_BASE_CASED = "bert-base-cased"
diff --git a/examples/seq2seq/test_fsmt_bleu_score.py b/examples/seq2seq/test_fsmt_bleu_score.py
index 95f475698f6f8f..beb7f2bc9857fd 100644
--- a/examples/seq2seq/test_fsmt_bleu_score.py
+++ b/examples/seq2seq/test_fsmt_bleu_score.py
@@ -14,19 +14,13 @@
 # limitations under the License.
 
 import io
-import unittest
-
-
-try:
-    from .utils import calculate_bleu
-except ImportError:
-    from utils import calculate_bleu
-
 import json
+import unittest
 
 from parameterized import parameterized
 from transformers import FSMTForConditionalGeneration, FSMTTokenizer
 from transformers.testing_utils import get_tests_dir, require_torch, slow, torch_device
+from utils import calculate_bleu
 
 
 filename = get_tests_dir() + "/test_data/fsmt/fsmt_val_data.json"

From ee9e6ec0e3b20a5dd27f392010b9c849299ce12d Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Sun, 20 Sep 2020 15:35:19 -0700
Subject: [PATCH 2/4] another script

---
 examples/seq2seq/distillation.py | 7 ++++++-
 examples/seq2seq/finetune.py     | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/examples/seq2seq/distillation.py b/examples/seq2seq/distillation.py
index 1ffd02e9a18450..2ad6d32a6dc91d 100755
--- a/examples/seq2seq/distillation.py
+++ b/examples/seq2seq/distillation.py
@@ -3,6 +3,7 @@
 import argparse
 import gc
 import os
+import sys
 import warnings
 from pathlib import Path
 from typing import List
@@ -15,7 +16,6 @@
 from finetune import SummarizationModule, TranslationModule
 from finetune import main as ft_main
 from initialization_utils import copy_layers, init_student
-from lightning_base import generic_train
 from transformers import AutoModelForSeq2SeqLM, MBartTokenizer, T5Config, T5ForConditionalGeneration
 from transformers.modeling_bart import shift_tokens_right
 from utils import (
@@ -29,6 +29,11 @@
 )
 
 
+# need the parent dir module
+sys.path.insert(2, str(Path(__file__).resolve().parents[1]))
+from lightning_base import generic_train # noqa
+
+
 class BartSummarizationDistiller(SummarizationModule):
     """Supports Bart, Pegasus and other models that inherit from Bart."""
 
diff --git a/examples/seq2seq/finetune.py b/examples/seq2seq/finetune.py
index 1f5a7b6e10593c..ae75753cba61b2 100755
--- a/examples/seq2seq/finetune.py
+++ b/examples/seq2seq/finetune.py
@@ -37,7 +37,7 @@
 )
 
 
-# need the parent dir scripts
+# need the parent dir module
 sys.path.insert(2, str(Path(__file__).resolve().parents[1]))
 from lightning_base import BaseTransformer, add_generic_args, generic_train # noqa
 

From b318640b66981e8fbe525a77ce9cc231b67f9072 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 24 Sep 2020 12:09:42 -0700
Subject: [PATCH 3/4] oops

---
 examples/seq2seq/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/seq2seq/README.md b/examples/seq2seq/README.md
index bd0fd1a9d4c255..f32d7ab7dc080f 100644
--- a/examples/seq2seq/README.md
+++ b/examples/seq2seq/README.md
@@ -92,7 +92,7 @@ All finetuning bash scripts call finetune.py (or distillation.py) with reasonabl
 To see all the possible command line options, run:
 
 ```bash
- --help # this calls ./finetune.py --help
+ ./finetune --help
 ```
 
 ### Finetuning Training Params

From c855ebe163e8d4338c37ab33d50206bb302fb9ca Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 24 Sep 2020 12:10:19 -0700
Subject: [PATCH 4/4] typo

---
 examples/seq2seq/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/seq2seq/README.md b/examples/seq2seq/README.md
index f32d7ab7dc080f..bdc675e0870ba0 100644
--- a/examples/seq2seq/README.md
+++ b/examples/seq2seq/README.md
@@ -92,7 +92,7 @@ All finetuning bash scripts call finetune.py (or distillation.py) with reasonabl
 To see all the possible command line options, run:
 
 ```bash
- ./finetune --help
+ ./finetune.py --help
 ```
 
 ### Finetuning Training Params
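
The pattern the patches above apply to each example script is: a `#!/usr/bin/env python` shebang plus the executable bit, and a `sys.path` tweak so that `lightning_base` (which lives one directory up, in `examples/`) can be imported without the `PYTHONPATH` export that `finetune.sh` used to do. Below is a minimal sketch of that pattern for a hypothetical new script `my_new_script.py` placed in `examples/seq2seq/`; the filename and body are illustrative only and are not part of the patch series.

```python
#!/usr/bin/env python
# my_new_script.py - hypothetical illustration only; not one of the files touched above.
import sys
from pathlib import Path

# examples/seq2seq/my_new_script.py -> parents[1] is examples/, where lightning_base.py lives,
# so no PYTHONPATH export is needed to import it.
sys.path.insert(2, str(Path(__file__).resolve().parents[1]))
from lightning_base import BaseTransformer  # noqa: E402


def main() -> None:
    # Placeholder body: a real script would build its parser and model here.
    print(f"imported {BaseTransformer.__name__} from {Path(__file__).resolve().parents[1]}")


if __name__ == "__main__":
    main()
```

After `chmod +x my_new_script.py` the script can be invoked directly as `./my_new_script.py`, which is what the mode changes in patch 1 enable for the existing scripts.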