From 3c120ace03359813d73ea351d59b9446bdc3e99b Mon Sep 17 00:00:00 2001
From: Sharath T S
Date: Wed, 8 Jul 2020 16:33:43 -0700
Subject: [PATCH 1/5] Update DataPrep

---
 .../BERT/data/create_datasets_from_start.sh   | 39 ++++++++++++-------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/PyTorch/LanguageModeling/BERT/data/create_datasets_from_start.sh b/PyTorch/LanguageModeling/BERT/data/create_datasets_from_start.sh
index 756cec209..3222ff46d 100755
--- a/PyTorch/LanguageModeling/BERT/data/create_datasets_from_start.sh
+++ b/PyTorch/LanguageModeling/BERT/data/create_datasets_from_start.sh
@@ -13,30 +13,39 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Download
-python3 /workspace/bert/data/bertPrep.py --action download --dataset bookscorpus
-python3 /workspace/bert/data/bertPrep.py --action download --dataset wikicorpus_en
+to_download=${1:-"wiki_only"}
 
-python3 /workspace/bert/data/bertPrep.py --action download --dataset google_pretrained_weights # Includes vocab
+#Download
+if [ "$to_download" = "wiki_books" ] ; then
+    python3 /workspace/bert/data/bertPrep.py --action download --dataset bookscorpus
+fi
+python3 /workspace/bert/data/bertPrep.py --action download --dataset wikicorpus_en
+python3 /workspace/bert/data/bertPrep.py --action download --dataset google_pretrained_weights # Includes vocab
 python3 /workspace/bert/data/bertPrep.py --action download --dataset squad
-#python3 /workspace/bert/data/bertPrep.py --action download --dataset mrpc
-
+python3 /workspace/bert/data/bertPrep.py --action download --dataset mrpc
+python3 /workspace/bert/data/bertPrep.py --action download --dataset sst-2
 
 # Properly format the text files
-python3 /workspace/bert/data/bertPrep.py --action text_formatting --dataset bookscorpus
+if [ "$to_download" = "wiki_books" ] ; then
+    python3 /workspace/bert/data/bertPrep.py --action text_formatting --dataset bookscorpus
+fi
 python3 /workspace/bert/data/bertPrep.py --action text_formatting --dataset wikicorpus_en
 
+if [ "$to_download" = "wiki_books" ] ; then
+    DATASET="books_wiki_en_corpus"
+else
+    DATASET="wikicorpus_en"
+    # Shard the text files
+fi
 
-# Shard the text files (group wiki+books then shard)
-python3 /workspace/bert/data/bertPrep.py --action sharding --dataset books_wiki_en_corpus
-
+# Shard the text files
+python3 /workspace/bert/data/bertPrep.py --action sharding --dataset $DATASET
 
 # Create HDF5 files Phase 1
-python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset books_wiki_en_corpus --max_seq_length 128 \
- --max_predictions_per_seq 20 --vocab_file $BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt --do_lower_case 1
-
+python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset $DATASET --max_seq_length 128 \
+--max_predictions_per_seq 20 --vocab_file $BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt --do_lower_case 1
 
 # Create HDF5 files Phase 2
-python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset books_wiki_en_corpus --max_seq_length 512 \
- --max_predictions_per_seq 80 --vocab_file $BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt --do_lower_case 1
\ No newline at end of file
+python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset $DATASET --max_seq_length 512 \
+--max_predictions_per_seq 80 --vocab_file $BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt --do_lower_case 1
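The first patch turns the hard-coded download list into a parameterized one. The block below is a minimal, standalone sketch of the bash idiom it relies on (a default positional parameter plus a string test), not part of the patch itself; `echo` stands in for the real `bertPrep.py` calls so the control flow can be tried outside the container.

```bash
#!/usr/bin/env bash
# Sketch of the argument handling introduced in PATCH 1/5.
# The real script calls bertPrep.py; echo is used here so this runs anywhere.

to_download=${1:-"wiki_only"}   # first CLI argument, defaulting to "wiki_only"

if [ "$to_download" = "wiki_books" ] ; then
    echo "would download and format bookscorpus"
    DATASET="books_wiki_en_corpus"
else
    DATASET="wikicorpus_en"
fi
echo "would download and format wikicorpus_en"

# Sharding and HDF5 creation all reuse the same $DATASET name.
echo "sharding and HDF5 creation would run on: $DATASET"
```

Running the sketch with no argument prints the Wikipedia-only path, while passing `wiki_books` exercises the combined-corpus branch — the same two invocations the README patches below document.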
From f403955c52a03ac85ea9013578914ef3ad900633 Mon Sep 17 00:00:00 2001
From: Sharath T S
Date: Wed, 8 Jul 2020 16:35:08 -0700
Subject: [PATCH 2/5] Update create_datasets_from_start.sh

---
 .../LanguageModeling/BERT/data/create_datasets_from_start.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/PyTorch/LanguageModeling/BERT/data/create_datasets_from_start.sh b/PyTorch/LanguageModeling/BERT/data/create_datasets_from_start.sh
index 3222ff46d..9d502e4b6 100755
--- a/PyTorch/LanguageModeling/BERT/data/create_datasets_from_start.sh
+++ b/PyTorch/LanguageModeling/BERT/data/create_datasets_from_start.sh
@@ -23,8 +23,6 @@ fi
 python3 /workspace/bert/data/bertPrep.py --action download --dataset wikicorpus_en
 python3 /workspace/bert/data/bertPrep.py --action download --dataset google_pretrained_weights # Includes vocab
 python3 /workspace/bert/data/bertPrep.py --action download --dataset squad
-python3 /workspace/bert/data/bertPrep.py --action download --dataset mrpc
-python3 /workspace/bert/data/bertPrep.py --action download --dataset sst-2
 
 # Properly format the text files
 if [ "$to_download" = "wiki_books" ] ; then

From 53f731cba2ff5c892dcc5276f3c7c03b4317ee2c Mon Sep 17 00:00:00 2001
From: Sharath T S
Date: Wed, 8 Jul 2020 16:35:55 -0700
Subject: [PATCH 3/5] Update README.md

---
 PyTorch/LanguageModeling/BERT/README.md | 27 +++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/PyTorch/LanguageModeling/BERT/README.md b/PyTorch/LanguageModeling/BERT/README.md
index fd317cefe..8ada3d8d3 100755
--- a/PyTorch/LanguageModeling/BERT/README.md
+++ b/PyTorch/LanguageModeling/BERT/README.md
@@ -178,6 +178,8 @@ Where `` is the optimization level. In the pretraining, `O2` is set a
 
 #### Enabling TF32
 
+This section is model specific and needs to show how to enable TF32. How is TF32 being implemented? Tweaking layers, preprocessing data, etc…
+
 TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
 
 TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
@@ -254,10 +256,10 @@ If you want to use a pre-trained checkpoint, visit [NGC](https://ngc.nvidia.com/
 
 Resultant logs and checkpoints of pretraining and fine-tuning routines are stored in the `results/` folder.
 
-`data` and `vocab.txt` are downloaded in the `data/` directory by default. Refer to the [Getting the data](#getting-the-data) section for more details on how to process a custom corpus as required for BERT pretraining.
-
+`data` and `vocab.txt` are downloaded in the `data/` directory by default. Refer to the [Getting the data](#getting-the-data) section for more details on how to process a custom corpus as required for BERT pretraining. 
+
 5. Download and preprocess the dataset.
-
+ 
 This repository provides scripts to download, verify, and extract the following datasets:
 
 - [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) (fine-tuning for question answering)
@@ -266,9 +268,22 @@
 To download, verify, extract the datasets, and create the shards in `.hdf5` format, run:
 
 `/workspace/bert/data/create_datasets_from_start.sh`
-
-Note: For fine tuning only, Wikipedia and Bookscorpus dataset download can be skipped by commenting it out. The pretraining dataset is 170GB+ and takes 15+ hours to download. The BookCorpus server could sometimes get overloaded and also contain broken links resulting in HTTP 403 and 503 errors. You can either skip the missing files or retry downloading at a later time. Expired dataset links are ignored during data download.
-
+
+Note: For fine-tuning only, the Wikipedia and BookCorpus download and preprocessing steps can be skipped by commenting them out.
+
+- Download Wikipedia only for pretraining
+
+The pretraining dataset is 170GB+ and takes 15+ hours to download. The BookCorpus server is often overloaded and can contain broken links, resulting in HTTP 403 and 503 errors. Hence, it is recommended to skip downloading BookCorpus data by running:
+
+`/workspace/bert/data/create_datasets_from_start.sh wiki_only`
+
+- Download Wikipedia and BookCorpus
+
+Users are welcome to download BookCorpus from other sources to match our accuracy, or repeatedly try our script until the required number of files are downloaded by running the following:
+`/workspace/bert/data/create_datasets_from_start.sh wiki_books`
+
+Note: Not using BookCorpus can potentially change final accuracy on a few downstream tasks.
+
 6. Start pretraining.
 
 To run on a single node 8 x V100 32G cards, from within the container, you can use the following script to run pre-training.
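With patches 1/5 through 3/5 applied, the data-prep script supports the two invocations documented in the README text above, plus an implicit default. A short recap, assuming the commands are run from within the BERT container as elsewhere in this README:

```bash
# Wikipedia only — recommended when the BookCorpus server is unreliable;
# final accuracy on a few downstream tasks can differ slightly.
/workspace/bert/data/create_datasets_from_start.sh wiki_only

# Wikipedia + BookCorpus — may need to be retried until the required
# number of BookCorpus files has been downloaded.
/workspace/bert/data/create_datasets_from_start.sh wiki_books

# No argument falls back to the wiki_only default set in the script.
/workspace/bert/data/create_datasets_from_start.sh
```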
From c775a09cb4d06ff568d3f9972e04a51819ecaaf5 Mon Sep 17 00:00:00 2001
From: Sharath T S
Date: Wed, 8 Jul 2020 16:39:02 -0700
Subject: [PATCH 4/5] Update README.md

---
 PyTorch/LanguageModeling/BERT/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/PyTorch/LanguageModeling/BERT/README.md b/PyTorch/LanguageModeling/BERT/README.md
index 8ada3d8d3..41a5278d4 100755
--- a/PyTorch/LanguageModeling/BERT/README.md
+++ b/PyTorch/LanguageModeling/BERT/README.md
@@ -280,6 +280,7 @@ The pretraining dataset is 170GB+ and takes 15+ hours to download. The BookCorpu
 - Download Wikipedia and BookCorpus
 
 Users are welcome to download BookCorpus from other sources to match our accuracy, or repeatedly try our script until the required number of files are downloaded by running the following:
+
 `/workspace/bert/data/create_datasets_from_start.sh wiki_books`
 
 Note: Not using BookCorpus can potentially change final accuracy on a few downstream tasks.
From 0e1c028d779af11647133db8032a8147503e6bcc Mon Sep 17 00:00:00 2001
From: Sharath T S
Date: Wed, 8 Jul 2020 16:40:14 -0700
Subject: [PATCH 5/5] Update README.md

---
 PyTorch/LanguageModeling/BERT/README.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/PyTorch/LanguageModeling/BERT/README.md b/PyTorch/LanguageModeling/BERT/README.md
index 41a5278d4..58e241ab2 100755
--- a/PyTorch/LanguageModeling/BERT/README.md
+++ b/PyTorch/LanguageModeling/BERT/README.md
@@ -178,8 +178,6 @@ Where `` is the optimization level. In the pretraining, `O2` is set a
 
 #### Enabling TF32
 
-This section is model specific and needs to show how to enable TF32. How is TF32 being implemented? Tweaking layers, preprocessing data, etc…
-
 TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
 
 TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
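Patch 5/5 removes the placeholder asking how TF32 is enabled, but the README still never shows an explicit switch. For reference only (not part of these patches, and assuming a PyTorch build of 1.7 or later such as the NGC containers this repository targets), the usual PyTorch-level knobs look like this:

```python
import torch

# Sketch only: the standard PyTorch flags controlling TF32 on Ampere GPUs.
# Defaults vary across PyTorch versions and containers.
torch.backends.cuda.matmul.allow_tf32 = True  # TF32 for matrix multiplications
torch.backends.cudnn.allow_tf32 = True        # TF32 for cuDNN convolutions

# Setting both flags to False forces full FP32 math, which is useful for
# an apples-to-apples accuracy comparison against TF32 runs.
```

TF32 is typically enabled by default in the NGC PyTorch containers, so the repository's scripts should pick it up on A100 without code changes; the flags above simply make the choice explicit.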