From 579c5191c68e0058a23aa4ac6511e88c4859788c Mon Sep 17 00:00:00 2001 From: Parmeet Singh Bhatia Date: Tue, 19 Oct 2021 11:20:16 -0700 Subject: [PATCH] Import torchtext #1410 0930843 Summary: Import latest from github Reviewed By: Nayef211 Differential Revision: D31745899 fbshipit-source-id: e4ac5c337bcbd1a8809544add7679dd3da242999 --- .circleci/config.yml | 6 +- .circleci/config.yml.in | 6 +- .circleci/unittest/linux/scripts/install.sh | 3 + .circleci/unittest/windows/scripts/install.sh | 3 + README.rst | 43 +-- examples/BERT/README.md | 143 ---------- examples/BERT/data.py | 54 ---- examples/BERT/metrics.py | 72 ----- examples/BERT/mlm_task.py | 269 ------------------ examples/BERT/model.py | 177 ------------ examples/BERT/ns_task.py | 262 ----------------- examples/BERT/qa_task.py | 214 -------------- examples/BERT/utils.py | 58 ---- packaging/build_conda.sh | 2 +- packaging/build_wheel.sh | 2 +- test/asset/raw_datasets.jsonl | 3 - test/common/case_utils.py | 7 + test/experimental/test_builtin_datasets.py | 3 + test/experimental/test_datasets.py | 34 +++ torchtext/_extension.py | 60 ++-- torchtext/_internal/__init__.py | 0 torchtext/_internal/module_utils.py | 11 + torchtext/data/datasets_utils.py | 2 +- torchtext/experimental/datasets/__init__.py | 3 +- torchtext/experimental/datasets/sst2.py | 90 ++++++ torchtext/vocab/vectors.py | 2 +- version.txt | 2 +- 27 files changed, 202 insertions(+), 1329 deletions(-) delete mode 100644 examples/BERT/README.md delete mode 100644 examples/BERT/data.py delete mode 100644 examples/BERT/metrics.py delete mode 100644 examples/BERT/mlm_task.py delete mode 100644 examples/BERT/model.py delete mode 100644 examples/BERT/ns_task.py delete mode 100644 examples/BERT/qa_task.py delete mode 100644 examples/BERT/utils.py create mode 100644 test/common/case_utils.py create mode 100644 test/experimental/test_datasets.py create mode 100644 torchtext/_internal/__init__.py create mode 100644 torchtext/_internal/module_utils.py create mode 100644 torchtext/experimental/datasets/sst2.py diff --git a/.circleci/config.yml b/.circleci/config.yml index ce597158dd..bc3f09d043 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -83,7 +83,7 @@ smoke_test_common: &smoke_test_common jobs: circleci_consistency: docker: - - image: circleci/python:3.8 + - image: cimg/python:3.8 steps: - checkout - run: @@ -234,7 +234,7 @@ jobs: # Requires org-member context binary_wheel_upload: docker: - - image: circleci/python:3.8 + - image: cimg/python:3.8 steps: - attach_workspace: at: ~/workspace @@ -497,7 +497,7 @@ jobs: - v1-windows-dataset-vector-{{ checksum ".cachekey" }} - v1-windows-dataset-{{ checksum ".cachekey" }} - + - run: name: Run tests # Downloading embedding vector takes long time. diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index c8e846ea0a..911295217b 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -83,7 +83,7 @@ smoke_test_common: &smoke_test_common jobs: circleci_consistency: docker: - - image: circleci/python:3.8 + - image: cimg/python:3.8 steps: - checkout - run: @@ -234,7 +234,7 @@ jobs: # Requires org-member context binary_wheel_upload: docker: - - image: circleci/python:3.8 + - image: cimg/python:3.8 steps: - attach_workspace: at: ~/workspace @@ -497,7 +497,7 @@ jobs: - v1-windows-dataset-vector-{{ checksum ".cachekey" }} - v1-windows-dataset-{{ checksum ".cachekey" }} {% endraw %} - + - run: name: Run tests # Downloading embedding vector takes long time. 
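The diffstat above adds test/common/case_utils.py (7 lines) and torchtext/_internal/module_utils.py (11 lines), and the install scripts below start pulling torchdata in from source. Their contents are not shown in this excerpt; what follows is a minimal sketch of the kind of optional-dependency guard such helpers usually provide, where every name and signature is an assumption rather than code taken from this patch:

    import importlib.util
    import unittest


    def is_module_available(*modules):
        # True only if every named module can be imported in this environment.
        return all(importlib.util.find_spec(m) is not None for m in modules)


    def skipIfNoModule(module):
        # Skip a test cleanly when an optional dependency (e.g. torchdata) is absent.
        return unittest.skipIf(not is_module_available(module), f"{module} is not available")


    class TorchdataSmokeTest(unittest.TestCase):
        @skipIfNoModule("torchdata")
        def test_import(self):
            import torchdata  # noqa: F401

A guard of this shape lets tests that need torchdata run in CI, where it is installed below, while being skipped on environments that do not have it.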
diff --git a/.circleci/unittest/linux/scripts/install.sh b/.circleci/unittest/linux/scripts/install.sh index e9201b266b..a3ecba2770 100755 --- a/.circleci/unittest/linux/scripts/install.sh +++ b/.circleci/unittest/linux/scripts/install.sh @@ -13,6 +13,9 @@ conda activate ./env printf "* Installing PyTorch\n" conda install -y -c "pytorch-${UPLOAD_CHANNEL}" ${CONDA_CHANNEL_FLAGS} pytorch cpuonly +printf "Installing torchdata from source\n" +pip install git+https://github.com/pytorch/data.git + printf "* Installing torchtext\n" git submodule update --init --recursive python setup.py develop diff --git a/.circleci/unittest/windows/scripts/install.sh b/.circleci/unittest/windows/scripts/install.sh index 622ebc1cd1..1922b9a78f 100644 --- a/.circleci/unittest/windows/scripts/install.sh +++ b/.circleci/unittest/windows/scripts/install.sh @@ -18,6 +18,9 @@ conda activate ./env printf "* Installing PyTorch\n" conda install -y -c "pytorch-${UPLOAD_CHANNEL}" ${CONDA_CHANNEL_FLAGS} pytorch cpuonly +printf "Installing torchdata from source\n" +pip install git+https://github.com/pytorch/data.git + printf "* Installing torchtext\n" git submodule update --init --recursive "$root_dir/packaging/vc_env_helper.bat" python setup.py develop diff --git a/README.rst b/README.rst index cd23d169aa..8a216e8262 100644 --- a/README.rst +++ b/README.rst @@ -1,7 +1,7 @@ .. image:: https://circleci.com/gh/pytorch/text.svg?style=svg :target: https://circleci.com/gh/pytorch/text -.. image:: https://codecov.io/gh/pytorch/text/branch/master/graph/badge.svg +.. image:: https://codecov.io/gh/pytorch/text/branch/main/graph/badge.svg :target: https://codecov.io/gh/pytorch/text .. image:: https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Ftorchtext%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v @@ -12,13 +12,13 @@ torchtext This repository consists of: -* `torchtext.datasets `_: The raw text iterators for common NLP datasets -* `torchtext.data `_: Some basic NLP building blocks (tokenizers, metrics, functionals etc.) -* `torchtext.nn `_: NLP related modules -* `torchtext.vocab `_: Vocab and Vectors related classes and factory functions -* `examples `_: Example NLP workflows with PyTorch and torchtext library. +* `torchtext.datasets `_: The raw text iterators for common NLP datasets +* `torchtext.data `_: Some basic NLP building blocks (tokenizers, metrics, functionals etc.) +* `torchtext.nn `_: NLP related modules +* `torchtext.vocab `_: Vocab and Vectors related classes and factory functions +* `examples `_: Example NLP workflows with PyTorch and torchtext library. -Note: The legacy code discussed in `torchtext v0.7.0 release note `_ has been retired to `torchtext.legacy `_ folder. Those legacy code will not be maintained by the development team, and we plan to fully remove them in the future release. See `torchtext.legacy `_ folder for more details. +Note: The legacy code discussed in `torchtext v0.7.0 release note `_ has been retired to `torchtext.legacy `_ folder. Those legacy code will not be maintained by the development team, and we plan to fully remove them in the future release. See `torchtext.legacy `_ folder for more details. Installation ============ @@ -29,14 +29,15 @@ We recommend Anaconda as a Python package management system. 
Please refer to `py :header: "PyTorch version", "torchtext version", "Supported Python version" :widths: 10, 10, 10 - nightly build, master, 3.6+ - 1.9, 0.10, 3.6+ - 1.8, 0.9, 3.6+ - 1.7, 0.8, 3.6+ - 1.6, 0.7, 3.6+ - 1.5, 0.6, 3.5+ - 1.4, 0.5, "2.7, 3.5+" - 0.4 and below, 0.2.3, "2.7, 3.5+" + nightly build, main, ">=3.6, <=3.9" + 1.9, 0.10, ">=3.6, <=3.9" + 1.8, 0.9, ">=3.6, <=3.9" + 1.7.1, 0.8.1, ">=3.6, <=3.9" + 1.7, 0.8, ">=3.6, <=3.8" + 1.6, 0.7, ">=3.6, <=3.8" + 1.5, 0.6, ">=3.5, <=3.8" + 1.4, 0.5, "2.7, >=3.5, <=3.8" + 0.4 and below, 0.2.3, "2.7, >=3.5, <=3.8" Using conda:: @@ -82,7 +83,7 @@ To build torchtext from source, you need ``git``, ``CMake`` and C++11 compiler s **Note** When building from source, make sure that you have the same C++ compiler as the one used to build PyTorch. A simple way is to build PyTorch from source and use the same environment to build torchtext. -If you are using the nightly build of PyTorch, checkout the environment it was built with `conda (here) `_ and `pip (here) `_. +If you are using the nightly build of PyTorch, checkout the environment it was built with `conda (here) `_ and `pip (here) `_. Documentation ============= @@ -130,8 +131,8 @@ To get started with torchtext, users may refer to the following tutorials availa We have re-written several building blocks under ``torchtext.experimental``: -* `Transforms `_: some basic data processing building blocks -* `Vectors `_: the vectors to convert tokens into tensors. +* `Transforms `_: some basic data processing building blocks +* `Vectors `_: the vectors to convert tokens into tensors. These prototype building blocks in the experimental folder are available in the nightly release only. The nightly packages are accessible via Pip and Conda for Windows, Mac, and Linux. For example, Linux users can install the nightly wheels with the following command:: @@ -142,7 +143,7 @@ For more detailed instructions, please refer to `Install PyTorch `_. This is part of the work to revamp the torchtext library and the motivation has been discussed in `Issue #664 `_: +In the v0.9.0 release, we moved the following legacy code to `torchtext.legacy `_. This is part of the work to revamp the torchtext library and the motivation has been discussed in `Issue #664 `_: * ``torchtext.legacy.data.field`` * ``torchtext.legacy.data.batch`` @@ -151,9 +152,9 @@ In the v0.9.0 release, we moved the following legacy code to `torchtext.legacy < * ``torchtext.legacy.data.pipeline`` * ``torchtext.legacy.datasets`` -We have a `migration tutorial `_ to help users switch to the torchtext datasets in ``v0.9.0`` release. For the users who still want the legacy components, they can add ``legacy`` to the import path. +We have a `migration tutorial `_ to help users switch to the torchtext datasets in ``v0.9.0`` release. For the users who still want the legacy components, they can add ``legacy`` to the import path. -In the v0.10.0 release, we retire the Vocab class to `torchtext.legacy `_. Users can still access the legacy Vocab via ``torchtext.legacy.vocab``. This class has been replaced by a Vocab module that is backed by efficient C++ implementation and provides common functional APIs for NLP workflows. +In the v0.10.0 release, we retire the Vocab class to `torchtext.legacy `_. Users can still access the legacy Vocab via ``torchtext.legacy.vocab``. This class has been replaced by a Vocab module that is backed by efficient C++ implementation and provides common functional APIs for NLP workflows. 
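Since the legacy Vocab is retired to ``torchtext.legacy`` above, here is a minimal sketch of the replacement workflow, assuming the 0.10-style factory ``build_vocab_from_iterator`` and the callable ``Vocab`` module::

    from torchtext.vocab import build_vocab_from_iterator

    # Toy corpus; the tokens are made up for illustration.
    tokens_iter = [["hello", "world"], ["hello", "torchtext"]]
    vocab = build_vocab_from_iterator(tokens_iter, specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])   # out-of-vocabulary fallback
    print(vocab(["hello", "never_seen"]))     # e.g. [1, 0]

``set_default_index`` takes over the role of the implicit ``<unk>`` fallback that the legacy Vocab provided.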
Disclaimer on Datasets ====================== diff --git a/examples/BERT/README.md b/examples/BERT/README.md deleted file mode 100644 index 3089efed65..0000000000 --- a/examples/BERT/README.md +++ /dev/null @@ -1,143 +0,0 @@ -# BERT with torchtext - -This example shows how to train a BERT model with PyTorch and torchtext only. Then, we fine-tune the pre-trained BERT for the question-answer task. - - -## Generate pre-trained BERT - -Train the BERT model with masked language modeling task and next-sentence task. Run the tasks on a local GPU or CPU: - - python mlm_task.py - python ns_task.py - -or run the tasks on a SLURM powered cluster with Distributed Data Parallel (DDP): - - srun --label --ntasks-per-node=1 --time=4000 --mem-per-cpu=5120 --gres=gpu:8 --cpus-per-task 80 --nodes=1 --pty python mlm_task.py --parallel DDP --log-interval 600 --dataset BookCorpus - - srun --label --ntasks-per-node=1 --time=4000 --mem-per-cpu=5120 --gres=gpu:8 --cpus-per-task 80 --nodes=1 --pty python ns_task.py --parallel DDP --bert-model mlm_bert.pt --dataset BookCorpus - -The result ppl of mlm_task is 18.97899 for the test set. -The result loss of ns_task is 0.05446 for the test set. - -## Fine-tune pre-trained BERT for question-answer task - -With SQuAD dataset, the pre-trained BERT is used for question-answer task: - - python qa_task.py --bert-model ns_bert.pt --epochs 30 - -The pre-trained BERT models and vocab are available: - -* [torchtext_bert_vocab.pt](https://pytorch.s3.amazonaws.com/models/text/torchtext_bert_example/torchtext_bert_vocab.pt) -* [mlm_bert.pt](https://pytorch.s3.amazonaws.com/models/text/torchtext_bert_example/mlm_bert.pt) -* [ns_bert.pt](https://pytorch.s3.amazonaws.com/models/text/torchtext_bert_example/ns_bert.pt) - -An example train/valid/test printout with the pretrained BERT model in question-answer task: - - | epoch 1 | 200/ 1055 batches | lr 5.00000 | ms/batch 748.82 | loss 3.75 | ppl 42.32 - | epoch 1 | 400/ 1055 batches | lr 5.00000 | ms/batch 746.04 | loss 3.46 | ppl 31.85 - | epoch 1 | 600/ 1055 batches | lr 5.00000 | ms/batch 748.82 | loss 3.09 | ppl 21.90 - | epoch 1 | 800/ 1055 batches | lr 5.00000 | ms/batch 743.96 | loss 2.77 | ppl 15.89 - | epoch 1 | 1000/ 1055 batches | lr 5.00000 | ms/batch 743.21 | loss 2.30 | ppl 9.99 - ----------------------------------------------------------------------------------------- - | end of epoch 1 | time: 821.76s | valid loss 1.92 | exact 49.945% | f1 62.056% - ----------------------------------------------------------------------------------------- - | epoch 2 | 200/ 1055 batches | lr 5.00000 | ms/batch 749.20 | loss 1.81 | ppl 6.10 - | epoch 2 | 400/ 1055 batches | lr 5.00000 | ms/batch 743.78 | loss 1.72 | ppl 5.61 - | epoch 2 | 600/ 1055 batches | lr 5.00000 | ms/batch 744.54 | loss 1.66 | ppl 5.28 - | epoch 2 | 800/ 1055 batches | lr 5.00000 | ms/batch 744.99 | loss 1.64 | ppl 5.17 - | epoch 2 | 1000/ 1055 batches | lr 5.00000 | ms/batch 744.06 | loss 1.60 | ppl 4.96 - ----------------------------------------------------------------------------------------- - | end of epoch 2 | time: 821.15s | valid loss 1.58 | exact 59.221% | f1 71.034% - ----------------------------------------------------------------------------------------- - | epoch 3 | 200/ 1055 batches | lr 5.00000 | ms/batch 747.07 | loss 1.41 | ppl 4.10 - | epoch 3 | 400/ 1055 batches | lr 5.00000 | ms/batch 743.91 | loss 1.39 | ppl 4.03 - | epoch 3 | 600/ 1055 batches | lr 5.00000 | ms/batch 743.71 | loss 1.39 | ppl 4.03 - | epoch 3 | 800/ 1055 batches | lr 5.00000 | 
ms/batch 744.33 | loss 1.39 | ppl 4.03 - | epoch 3 | 1000/ 1055 batches | lr 5.00000 | ms/batch 744.86 | loss 1.40 | ppl 4.05 - ----------------------------------------------------------------------------------------- - | end of epoch 3 | time: 820.46s | valid loss 1.46 | exact 62.612% | f1 73.513% - ----------------------------------------------------------------------------------------- - | epoch 4 | 200/ 1055 batches | lr 5.00000 | ms/batch 749.89 | loss 1.20 | ppl 3.33 - | epoch 4 | 400/ 1055 batches | lr 5.00000 | ms/batch 748.50 | loss 1.20 | ppl 3.32 - | epoch 4 | 600/ 1055 batches | lr 5.00000 | ms/batch 745.78 | loss 1.24 | ppl 3.47 - | epoch 4 | 800/ 1055 batches | lr 5.00000 | ms/batch 744.94 | loss 1.24 | ppl 3.45 - | epoch 4 | 1000/ 1055 batches | lr 5.00000 | ms/batch 744.22 | loss 1.25 | ppl 3.48 - ----------------------------------------------------------------------------------------- - | end of epoch 4 | time: 822.04s | valid loss 1.47 | exact 62.758% | f1 73.744% - ----------------------------------------------------------------------------------------- - | epoch 5 | 200/ 1055 batches | lr 5.00000 | ms/batch 747.76 | loss 1.05 | ppl 2.87 - | epoch 5 | 400/ 1055 batches | lr 5.00000 | ms/batch 743.78 | loss 1.08 | ppl 2.94 - | epoch 5 | 600/ 1055 batches | lr 5.00000 | ms/batch 743.69 | loss 1.09 | ppl 2.97 - | epoch 5 | 800/ 1055 batches | lr 5.00000 | ms/batch 743.58 | loss 1.10 | ppl 3.01 - | epoch 5 | 1000/ 1055 batches | lr 5.00000 | ms/batch 743.05 | loss 1.13 | ppl 3.08 - ----------------------------------------------------------------------------------------- - | end of epoch 5 | time: 819.86s | valid loss 1.49 | exact 63.372% | f1 74.179% - ----------------------------------------------------------------------------------------- - | epoch 6 | 200/ 1055 batches | lr 5.00000 | ms/batch 748.29 | loss 0.93 | ppl 2.54 - | epoch 6 | 400/ 1055 batches | lr 5.00000 | ms/batch 744.01 | loss 0.96 | ppl 2.62 - | epoch 6 | 600/ 1055 batches | lr 5.00000 | ms/batch 744.13 | loss 0.97 | ppl 2.63 - | epoch 6 | 800/ 1055 batches | lr 5.00000 | ms/batch 744.19 | loss 0.99 | ppl 2.68 - | epoch 6 | 1000/ 1055 batches | lr 5.00000 | ms/batch 744.10 | loss 1.00 | ppl 2.73 - ----------------------------------------------------------------------------------------- - | end of epoch 6 | time: 820.67s | valid loss 1.52 | exact 62.902% | f1 73.918% - ----------------------------------------------------------------------------------------- - | epoch 7 | 200/ 1055 batches | lr 0.50000 | ms/batch 748.94 | loss 0.74 | ppl 2.09 - | epoch 7 | 400/ 1055 batches | lr 0.50000 | ms/batch 743.26 | loss 0.70 | ppl 2.01 - | epoch 7 | 600/ 1055 batches | lr 0.50000 | ms/batch 745.73 | loss 0.68 | ppl 1.97 - | epoch 7 | 800/ 1055 batches | lr 0.50000 | ms/batch 745.74 | loss 0.67 | ppl 1.96 - | epoch 7 | 1000/ 1055 batches | lr 0.50000 | ms/batch 744.42 | loss 0.65 | ppl 1.92 - ----------------------------------------------------------------------------------------- - | end of epoch 7 | time: 820.97s | valid loss 1.60 | exact 65.965% | f1 76.315% - ----------------------------------------------------------------------------------------- - | epoch 8 | 200/ 1055 batches | lr 0.50000 | ms/batch 748.37 | loss 0.61 | ppl 1.85 - | epoch 8 | 400/ 1055 batches | lr 0.50000 | ms/batch 747.32 | loss 0.60 | ppl 1.82 - | epoch 8 | 600/ 1055 batches | lr 0.50000 | ms/batch 746.12 | loss 0.60 | ppl 1.82 - | epoch 8 | 800/ 1055 batches | lr 0.50000 | ms/batch 745.98 | loss 0.60 | ppl 1.83 - | epoch 8 | 1000/ 1055 
batches | lr 0.50000 | ms/batch 744.58 | loss 0.60 | ppl 1.82 - ----------------------------------------------------------------------------------------- - | end of epoch 8 | time: 821.95s | valid loss 1.64 | exact 65.214% | f1 76.046% - ----------------------------------------------------------------------------------------- - | epoch 9 | 200/ 1055 batches | lr 0.05000 | ms/batch 748.68 | loss 0.55 | ppl 1.74 - | epoch 9 | 400/ 1055 batches | lr 0.05000 | ms/batch 743.93 | loss 0.54 | ppl 1.71 - | epoch 9 | 600/ 1055 batches | lr 0.05000 | ms/batch 744.58 | loss 0.55 | ppl 1.72 - | epoch 9 | 800/ 1055 batches | lr 0.05000 | ms/batch 744.37 | loss 0.56 | ppl 1.75 - | epoch 9 | 1000/ 1055 batches | lr 0.05000 | ms/batch 744.40 | loss 0.54 | ppl 1.72 - ----------------------------------------------------------------------------------------- - | end of epoch 9 | time: 820.87s | valid loss 1.66 | exact 65.272% | f1 75.929% - ----------------------------------------------------------------------------------------- - | epoch 10 | 200/ 1055 batches | lr 0.00500 | ms/batch 748.50 | loss 0.54 | ppl 1.72 - | epoch 10 | 400/ 1055 batches | lr 0.00500 | ms/batch 744.92 | loss 0.55 | ppl 1.72 - | epoch 10 | 600/ 1055 batches | lr 0.00500 | ms/batch 745.06 | loss 0.55 | ppl 1.73 - | epoch 10 | 800/ 1055 batches | lr 0.00500 | ms/batch 745.30 | loss 0.54 | ppl 1.71 - | epoch 10 | 1000/ 1055 batches | lr 0.00500 | ms/batch 746.06 | loss 0.54 | ppl 1.72 - ----------------------------------------------------------------------------------------- - | end of epoch 10 | time: 821.62s | valid loss 1.67 | exact 65.382% | f1 76.090% - ----------------------------------------------------------------------------------------- - ========================================================================================= - | End of training | test loss 1.61 | exact 66.124% | f1 76.373% - ========================================================================================= - -## Structure of the example - -### model.py - -This file defines the Transformer and MultiheadAttention models used for BERT. The embedding layer include PositionalEncoding and TokenTypeEncoding layers. MLMTask, NextSentenceTask, and QuestionAnswerTask are the models for the three tasks mentioned above. - -### data.py - -This file provides a few datasets required to train the BERT model and question-answer task. Please note that BookCorpus dataset is not available publicly. - - -### mlm_task.py, ns_task.py, qa_task.py - -Those three files define the train/valid/test process for the tasks. - - -### metrics.py - -This file provides two metrics (F1 and exact score) for question-answer task - - -### utils.py - -This file provides a few utils used by the three tasks. 
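For reference, the exact and F1 scores mentioned in the metrics.py description above are plain token-overlap metrics: each sample is scored against its best-matching answer candidate, and the per-sample scores are averaged and scaled by 100 over the dataset, as in metrics.py below. A small worked example with made-up tokens:

    from collections import Counter

    answer, prediction = ["the", "cat", "sat"], ["the", "cat"]        # hypothetical tokens
    num_same = sum((Counter(answer) & Counter(prediction)).values())  # 2 shared tokens
    precision = num_same / len(prediction)                            # 2/2 = 1.0
    recall = num_same / len(answer)                                   # 2/3
    f1 = 2 * precision * recall / (precision + recall)                # 0.8
    exact = int(" ".join(answer) == " ".join(prediction))             # 0 (metrics.py also normalizes text first)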
diff --git a/examples/BERT/data.py b/examples/BERT/data.py deleted file mode 100644 index b5e669dc9f..0000000000 --- a/examples/BERT/data.py +++ /dev/null @@ -1,54 +0,0 @@ -import glob -import torch -import logging -from torchtext.data.utils import get_tokenizer -import random -from torchtext.experimental.datasets import LanguageModelingDataset - - -################################################################### -# Set up dataset for book corpus -################################################################### -def BookCorpus(vocab, tokenizer=get_tokenizer("basic_english"), - data_select=('train', 'valid', 'test'), removed_tokens=[], - min_sentence_len=None): - - if isinstance(data_select, str): - data_select = [data_select] - if not set(data_select).issubset(set(('train', 'test', 'valid'))): - raise TypeError('data_select is not supported!') - - extracted_files = glob.glob('/datasets01/bookcorpus/021819/*/*.txt') - random.seed(1000) - random.shuffle(extracted_files) - - num_files = len(extracted_files) - _path = {'train': extracted_files[:(num_files // 20 * 17)], - 'test': extracted_files[(num_files // 20 * 17):(num_files // 20 * 18)], - 'valid': extracted_files[(num_files // 20 * 18):]} - - data = {} - for item in _path.keys(): - data[item] = [] - logging.info('Creating {} data'.format(item)) - tokens = [] - for txt_file in _path[item]: - with open(txt_file, 'r', encoding="utf8", errors='ignore') as f: - for line in f.readlines(): - _tokens = tokenizer(line.strip()) - if min_sentence_len: - if len(_tokens) >= min_sentence_len: - tokens.append([vocab.stoi[token] for token in _tokens]) - else: - tokens += [vocab.stoi[token] for token in _tokens] - data[item] = tokens - - for key in data_select: - if data[key] == []: - raise TypeError('Dataset {} is empty!'.format(key)) - if min_sentence_len: - return tuple(LanguageModelingDataset(data[d], vocab, lambda x: x, False) - for d in data_select) - else: - return tuple(LanguageModelingDataset(torch.tensor(data[d]).long(), vocab, lambda x: x, False) - for d in data_select) diff --git a/examples/BERT/metrics.py b/examples/BERT/metrics.py deleted file mode 100644 index dba20bb753..0000000000 --- a/examples/BERT/metrics.py +++ /dev/null @@ -1,72 +0,0 @@ -import collections -import re -import string - - -def compute_qa_exact(ans_pred_tokens_samples): - - ''' - Input: ans_pred_tokens_samples: [([ans1_tokens_candidate1, ans1_tokens_candidate2], pred1_tokens), - ([ans2_tokens_candidate1, ans2_tokens_candidate2], pred2_tokens), - ... 
- ([ansn_tokens_candidate1, ansn_tokens_candidate2], predn_tokens)] - ans1_tokens_candidate1 = ['this', 'is', 'an', 'sample', 'example'] - Output: exact score of the samples - ''' - - def normalize_txt(text): - # lower case - text = text.lower() - - # remove punc - exclude = set(string.punctuation) - text = "".join(ch for ch in text if ch not in exclude) - - # remove articles - regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) - text = re.sub(regex, " ", text) - - # white space fix - return " ".join(text.split()) - - exact_scores = [] - for (ans_tokens, pred_tokens) in ans_pred_tokens_samples: - pred_str = " ".join(pred_tokens) - candidate_score = [] - for item in ans_tokens: - ans_str = " ".join(item) - candidate_score.append(int(normalize_txt(ans_str) == normalize_txt(pred_str))) - exact_scores.append(max(candidate_score)) - return 100.0 * sum(exact_scores) / len(exact_scores) - - -def compute_qa_f1(ans_pred_tokens_samples): - - ''' - Input: ans_pred_tokens_samples: [([ans1_tokens_candidate1, ans1_tokens_candidate2], pred1_tokens), - ([ans2_tokens_candidate1, ans2_tokens_candidate2], pred2_tokens), - ... - ([ansn_tokens_candidate1, ansn_tokens_candidate2], predn_tokens)] - ans1_tokens_candidate1 = ['this', 'is', 'an', 'sample', 'example'] - Output: f1 score of the samples - ''' - def sample_f1(ans_tokens, pred_tokens): - common = collections.Counter(ans_tokens) & collections.Counter(pred_tokens) - num_same = sum(common.values()) - if len(ans_tokens) == 0 or len(pred_tokens) == 0: - # If either is no-answer, then F1 is 1 if they agree, 0 otherwise - return int(ans_tokens == pred_tokens) - if num_same == 0: - return 0 - precision = 1.0 * num_same / len(pred_tokens) - recall = 1.0 * num_same / len(ans_tokens) - f1 = (2 * precision * recall) / (precision + recall) - return f1 - - f1_scores = [] - for (ans_tokens, pred_tokens) in ans_pred_tokens_samples: - candidate_score = [] - for item in ans_tokens: - candidate_score.append(sample_f1(item, pred_tokens)) - f1_scores.append(max(candidate_score)) - return 100.0 * sum(f1_scores) / len(f1_scores) diff --git a/examples/BERT/mlm_task.py b/examples/BERT/mlm_task.py deleted file mode 100644 index aaf61a8e87..0000000000 --- a/examples/BERT/mlm_task.py +++ /dev/null @@ -1,269 +0,0 @@ -import argparse -import time -import math -import torch -import torch.nn as nn -from model import MLMTask -from utils import run_demo, run_ddp, wrap_up -import torch.distributed as dist -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.utils.data import DataLoader - - -def collate_batch(batch_data, args, mask_id, cls_id): - batch_data = torch.tensor(batch_data).long().view(args.batch_size, -1).t().contiguous() - # Generate masks with args.mask_frac - data_len = batch_data.size(0) - ones_num = int(data_len * args.mask_frac) - zeros_num = data_len - ones_num - lm_mask = torch.cat([torch.zeros(zeros_num), torch.ones(ones_num)]) - lm_mask = lm_mask[torch.randperm(data_len)] - batch_data = torch.cat((torch.tensor([[cls_id] * batch_data.size(1)]).long(), batch_data)) - lm_mask = torch.cat((torch.tensor([0.0]), lm_mask)) - - targets = torch.stack([batch_data[i] for i in range(lm_mask.size(0)) if lm_mask[i]]).view(-1) - batch_data = batch_data.masked_fill(lm_mask.bool().unsqueeze(1), mask_id) - return batch_data, lm_mask, targets - - -def process_raw_data(raw_data, args): - _num = raw_data.size(0) // (args.batch_size * args.bptt) - raw_data = raw_data[:(_num * args.batch_size * args.bptt)] - return raw_data - - -def evaluate(data_source, model, vocab, 
ntokens, criterion, args, device): - # Turn on evaluation mode which disables dropout. - model.eval() - total_loss = 0. - mask_id = vocab.stoi[''] - cls_id = vocab.stoi[''] - dataloader = DataLoader(data_source, batch_size=args.batch_size * args.bptt, - shuffle=False, collate_fn=lambda b: collate_batch(b, args, mask_id, cls_id)) - with torch.no_grad(): - for batch, (data, lm_mask, targets) in enumerate(dataloader): - if args.parallel == 'DDP': - data = data.to(device[0]) - targets = targets.to(device[0]) - else: - data = data.to(device) - targets = targets.to(device) - data = data.transpose(0, 1) # Wrap up by DDP or DataParallel - output = model(data) - output = torch.stack([output[i] for i in range(lm_mask.size(0)) if lm_mask[i]]) - output_flat = output.view(-1, ntokens) - total_loss += criterion(output_flat, targets).item() - return total_loss / ((len(data_source) - 1) / args.bptt / args.batch_size) - - -def train(model, vocab, train_loss_log, train_data, - optimizer, criterion, ntokens, epoch, scheduler, args, device, rank=None): - model.train() - total_loss = 0. - start_time = time.time() - mask_id = vocab.stoi[''] - cls_id = vocab.stoi[''] - train_loss_log.append(0.0) - dataloader = DataLoader(train_data, batch_size=args.batch_size * args.bptt, - shuffle=False, collate_fn=lambda b: collate_batch(b, args, mask_id, cls_id)) - - for batch, (data, lm_mask, targets) in enumerate(dataloader): - optimizer.zero_grad() - if args.parallel == 'DDP': - data = data.to(device[0]) - targets = targets.to(device[0]) - else: - data = data.to(device) - targets = targets.to(device) - data = data.transpose(0, 1) # Wrap up by DDP or DataParallel - output = model(data) - output = torch.stack([output[i] for i in range(lm_mask.size(0)) if lm_mask[i]]) - loss = criterion(output.view(-1, ntokens), targets) - loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) - optimizer.step() - total_loss += loss.item() - if batch % args.log_interval == 0 and batch > 0: - cur_loss = total_loss / args.log_interval - elapsed = time.time() - start_time - if (rank is None) or rank == 0: - train_loss_log[-1] = cur_loss - print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | ' - 'loss {:5.2f} | ppl {:8.2f}'.format(epoch, batch, - len(train_data) // (args.bptt * args.batch_size), - scheduler.get_last_lr()[0], - elapsed * 1000 / args.log_interval, - cur_loss, math.exp(cur_loss))) - total_loss = 0 - start_time = time.time() - - -def run_main(args, rank=None): - torch.manual_seed(args.seed) - if args.parallel == 'DDP': - n = torch.cuda.device_count() // args.world_size - device = list(range(rank * n, (rank + 1) * n)) - else: - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - import torchtext - if args.dataset == 'WikiText103': - from torchtext.experimental.datasets import WikiText103 as WLMDataset - elif args.dataset == 'WikiText2': - from torchtext.experimental.datasets import WikiText2 as WLMDataset - elif args.dataset == 'WMTNewsCrawl': - from data import WMTNewsCrawl as WLMDataset - elif args.dataset == 'EnWik9': - from torchtext.datasets import EnWik9 - elif args.dataset == 'BookCorpus': - from data import BookCorpus - else: - print("dataset for MLM task is not supported") - - try: - vocab = torch.load(args.save_vocab) - except: - train_dataset, valid_dataset, test_dataset = WLMDataset() - old_vocab = train_dataset.vocab - vocab = torchtext.legacy.vocab.Vocab(counter=old_vocab.freqs, - specials=['', '', '']) - with open(args.save_vocab, 'wb') as f: - 
torch.save(vocab, f) - - if args.dataset == 'WikiText103' or args.dataset == 'WikiText2': - train_dataset, valid_dataset, test_dataset = WLMDataset(vocab=vocab) - train_dataset.data = torch.cat(tuple(filter(lambda t: t.numel() > 0, train_dataset))) - valid_dataset.data = torch.cat(tuple(filter(lambda t: t.numel() > 0, valid_dataset))) - test_dataset.data = torch.cat(tuple(filter(lambda t: t.numel() > 0, test_dataset))) - elif args.dataset == 'WMTNewsCrawl': - from torchtext.experimental.datasets import WikiText2 - test_dataset, valid_dataset = WikiText2(vocab=vocab, data_select=('test', 'valid')) - valid_dataset.data = torch.cat(tuple(filter(lambda t: t.numel() > 0, valid_dataset))) - test_dataset.data = torch.cat(tuple(filter(lambda t: t.numel() > 0, test_dataset))) - train_dataset, = WLMDataset(vocab=vocab, data_select='train') - train_dataset.data = torch.cat(tuple(filter(lambda t: t.numel() > 0, train_dataset))) - elif args.dataset == 'EnWik9': - enwik9 = EnWik9() - idx1, idx2 = int(len(enwik9) * 0.8), int(len(enwik9) * 0.9) - train_data = torch.tensor([vocab.stoi[_id] - for _id in enwik9[0:idx1]]).long() - val_data = torch.tensor([vocab.stoi[_id] - for _id in enwik9[idx1:idx2]]).long() - test_data = torch.tensor([vocab.stoi[_id] - for _id in enwik9[idx2:]]).long() - from torchtext.experimental.datasets import LanguageModelingDataset - train_dataset = LanguageModelingDataset(train_data, vocab) - valid_dataset = LanguageModelingDataset(val_data, vocab) - test_dataset = LanguageModelingDataset(test_data, vocab) - elif args.dataset == 'BookCorpus': - train_dataset, valid_dataset, test_dataset = BookCorpus(vocab) - - train_data = process_raw_data(train_dataset.data, args) - if rank is not None: - # Chunk training data by rank for different gpus - chunk_len = len(train_data) // args.world_size - train_data = train_data[(rank * chunk_len):((rank + 1) * chunk_len)] - val_data = process_raw_data(valid_dataset.data, args) - test_data = process_raw_data(test_dataset.data, args) - - ntokens = len(train_dataset.get_vocab()) - if args.checkpoint != 'None': - model = torch.load(args.checkpoint) - else: - model = MLMTask(ntokens, args.emsize, args.nhead, args.nhid, args.nlayers, args.dropout) - if args.parallel == 'DDP': - model = model.to(device[0]) - model = DDP(model, device_ids=device) - else: - model = model.to(device) - criterion = nn.CrossEntropyLoss() - optimizer = torch.optim.SGD(model.parameters(), lr=args.lr) - scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1) - best_val_loss = None - train_loss_log, val_loss_log = [], [] - - for epoch in range(1, args.epochs + 1): - epoch_start_time = time.time() - train(model, train_dataset.vocab, train_loss_log, train_data, - optimizer, criterion, ntokens, epoch, scheduler, args, device, rank) - val_loss = evaluate(val_data, model, train_dataset.vocab, ntokens, criterion, args, device) - if (rank is None) or (rank == 0): - val_loss_log.append(val_loss) - print('-' * 89) - print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' - 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), - val_loss, math.exp(val_loss))) - print('-' * 89) - if not best_val_loss or val_loss < best_val_loss: - if rank is None: - with open(args.save, 'wb') as f: - torch.save(model, f) - elif rank == 0: - with open(args.save, 'wb') as f: - torch.save(model.state_dict(), f) - best_val_loss = val_loss - else: - scheduler.step() - if args.parallel == 'DDP': - dist.barrier() - rank0_devices = [x - rank * len(device) for x in device] - 
device_pairs = zip(rank0_devices, device) - map_location = {'cuda:%d' % x: 'cuda:%d' % y for x, y in device_pairs} - model.load_state_dict( - torch.load(args.save, map_location=map_location)) - test_loss = evaluate(test_data, model, train_dataset.vocab, ntokens, criterion, args, device) - if rank == 0: - wrap_up(train_loss_log, val_loss_log, test_loss, args, model.module, 'mlm_loss.txt', 'full_mlm_model.pt') - else: - with open(args.save, 'rb') as f: - model = torch.load(f) - test_loss = evaluate(test_data, model, train_dataset.vocab, ntokens, criterion, args, device) - wrap_up(train_loss_log, val_loss_log, test_loss, args, model, 'mlm_loss.txt', 'full_mlm_model.pt') - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 Transformer Language Model') - parser.add_argument('--emsize', type=int, default=768, - help='size of word embeddings') - parser.add_argument('--nhid', type=int, default=3072, - help='number of hidden units per layer') - parser.add_argument('--nlayers', type=int, default=12, - help='number of layers') - parser.add_argument('--nhead', type=int, default=12, - help='the number of heads in the encoder/decoder of the transformer model') - parser.add_argument('--lr', type=float, default=6, - help='initial learning rate') - parser.add_argument('--clip', type=float, default=0.1, - help='gradient clipping') - parser.add_argument('--epochs', type=int, default=8, - help='upper epoch limit') - parser.add_argument('--batch_size', type=int, default=32, metavar='N', - help='batch size') - parser.add_argument('--bptt', type=int, default=128, - help='sequence length') - parser.add_argument('--dropout', type=float, default=0.2, - help='dropout applied to layers (0 = no dropout)') - parser.add_argument('--seed', type=int, default=5431916812, - help='random seed') - parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='report interval') - parser.add_argument('--checkpoint', type=str, default='None', - help='path to load the checkpoint') - parser.add_argument('--save', type=str, default='mlm_bert.pt', - help='path to save the final model') - parser.add_argument('--save-vocab', type=str, default='torchtext_bert_vocab.pt', - help='path to save the vocab') - parser.add_argument('--mask_frac', type=float, default=0.15, - help='the fraction of masked tokens') - parser.add_argument('--dataset', type=str, default='WikiText2', - help='dataset used for MLM task') - parser.add_argument('--parallel', type=str, default='None', - help='Use DataParallel to train model') - parser.add_argument('--world_size', type=int, default=8, - help='the world size to initiate DPP') - args = parser.parse_args() - - if args.parallel == 'DDP': - run_demo(run_ddp, run_main, args) - else: - run_main(args) diff --git a/examples/BERT/model.py b/examples/BERT/model.py deleted file mode 100644 index 484117e19c..0000000000 --- a/examples/BERT/model.py +++ /dev/null @@ -1,177 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.nn import Linear, Dropout, LayerNorm, TransformerEncoder -from torchtext.nn import MultiheadAttentionContainer, InProjContainer, ScaledDotProduct - - -class PositionalEncoding(nn.Module): - def __init__(self, d_model, max_len=5000): - super(PositionalEncoding, self).__init__() - self.pos_embedding = nn.Embedding(max_len, d_model) - - def forward(self, x): - S, N = x.size() - pos = torch.arange(S, - dtype=torch.long, - device=x.device).unsqueeze(0).expand((N, S)).t() - return self.pos_embedding(pos) - - 
-class TokenTypeEncoding(nn.Module): - def __init__(self, type_token_num, d_model): - super(TokenTypeEncoding, self).__init__() - self.token_type_embeddings = nn.Embedding(type_token_num, d_model) - - def forward(self, seq_input, token_type_input): - S, N = seq_input.size() - if token_type_input is None: - token_type_input = torch.zeros((S, N), - dtype=torch.long, - device=seq_input.device) - return self.token_type_embeddings(token_type_input) - - -class BertEmbedding(nn.Module): - def __init__(self, ntoken, ninp, dropout=0.5): - super(BertEmbedding, self).__init__() - self.ninp = ninp - self.ntoken = ntoken - self.pos_embed = PositionalEncoding(ninp) - self.embed = nn.Embedding(ntoken, ninp) - self.tok_type_embed = TokenTypeEncoding(2, ninp) # Two sentence type - self.norm = LayerNorm(ninp) - self.dropout = Dropout(dropout) - - def forward(self, seq_inputs): - src, token_type_input = seq_inputs - src = self.embed(src) + self.pos_embed(src) \ - + self.tok_type_embed(src, token_type_input) - return self.dropout(self.norm(src)) - - -class TransformerEncoderLayer(nn.Module): - def __init__(self, d_model, nhead, dim_feedforward=2048, - dropout=0.1, activation="gelu"): - super(TransformerEncoderLayer, self).__init__() - in_proj_container = InProjContainer(Linear(d_model, d_model), - Linear(d_model, d_model), - Linear(d_model, d_model)) - self.mha = MultiheadAttentionContainer(nhead, in_proj_container, - ScaledDotProduct(), Linear(d_model, d_model)) - self.linear1 = Linear(d_model, dim_feedforward) - self.dropout = Dropout(dropout) - self.linear2 = Linear(dim_feedforward, d_model) - - self.norm1 = LayerNorm(d_model) - self.norm2 = LayerNorm(d_model) - self.dropout1 = Dropout(dropout) - self.dropout2 = Dropout(dropout) - - if activation == "relu": - self.activation = F.relu - elif activation == "gelu": - self.activation = F.gelu - else: - raise RuntimeError("only relu/gelu are supported, not {}".format(activation)) - - def init_weights(self): - self.mha.in_proj_container.query_proj.init_weights() - self.mha.in_proj_container.key_proj.init_weights() - self.mha.in_proj_container.value_proj.init_weights() - self.mha.out_proj.init_weights() - self.linear1.weight.data.normal_(mean=0.0, std=0.02) - self.linear2.weight.data.normal_(mean=0.0, std=0.02) - self.norm1.bias.data.zero_() - self.norm1.weight.data.fill_(1.0) - self.norm2.bias.data.zero_() - self.norm2.weight.data.fill_(1.0) - - def forward(self, src, src_mask=None, src_key_padding_mask=None): - attn_output, attn_output_weights = self.mha(src, src, src, attn_mask=src_mask) - src = src + self.dropout1(attn_output) - src = self.norm1(src) - src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) - src = src + self.dropout2(src2) - src = self.norm2(src) - return src - - -class BertModel(nn.Module): - """Contain a transformer encoder.""" - - def __init__(self, ntoken, ninp, nhead, nhid, nlayers, embed_layer, dropout=0.5): - super(BertModel, self).__init__() - self.model_type = 'Transformer' - self.bert_embed = embed_layer - encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) - self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) - self.ninp = ninp - - def forward(self, seq_inputs): - src = self.bert_embed(seq_inputs) - output = self.transformer_encoder(src) - return output - - -class MLMTask(nn.Module): - """Contain a transformer encoder plus MLM head.""" - - def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): - super(MLMTask, self).__init__() - embed_layer = BertEmbedding(ntoken, 
ninp) - self.bert_model = BertModel(ntoken, ninp, nhead, nhid, nlayers, embed_layer, dropout=0.5) - self.mlm_span = Linear(ninp, ninp) - self.activation = F.gelu - self.norm_layer = LayerNorm(ninp, eps=1e-12) - self.mlm_head = Linear(ninp, ntoken) - - def forward(self, src, token_type_input=None): - src = src.transpose(0, 1) # Wrap up by nn.DataParallel - output = self.bert_model((src, token_type_input)) - output = self.mlm_span(output) - output = self.activation(output) - output = self.norm_layer(output) - output = self.mlm_head(output) - return output - - -class NextSentenceTask(nn.Module): - """Contain a pretrain BERT model and a linear layer.""" - - def __init__(self, bert_model): - super(NextSentenceTask, self).__init__() - self.bert_model = bert_model - self.linear_layer = Linear(bert_model.ninp, - bert_model.ninp) - self.ns_span = Linear(bert_model.ninp, 2) - self.activation = nn.Tanh() - - def forward(self, src, token_type_input): - src = src.transpose(0, 1) # Wrap up by nn.DataParallel - output = self.bert_model((src, token_type_input)) - # Send the first <'cls'> seq to a classifier - output = self.activation(self.linear_layer(output[0])) - output = self.ns_span(output) - return output - - -class QuestionAnswerTask(nn.Module): - """Contain a pretrain BERT model and a linear layer.""" - - def __init__(self, bert_model): - super(QuestionAnswerTask, self).__init__() - self.bert_model = bert_model - self.activation = F.gelu - self.qa_span = Linear(bert_model.ninp, 2) - - def forward(self, src, token_type_input): - output = self.bert_model((src, token_type_input)) - # transpose output (S, N, E) to (N, S, E) - output = output.transpose(0, 1) - output = self.activation(output) - pos_output = self.qa_span(output) - start_pos, end_pos = pos_output.split(1, dim=-1) - start_pos = start_pos.squeeze(-1) - end_pos = end_pos.squeeze(-1) - return start_pos, end_pos diff --git a/examples/BERT/ns_task.py b/examples/BERT/ns_task.py deleted file mode 100644 index 3084686ebb..0000000000 --- a/examples/BERT/ns_task.py +++ /dev/null @@ -1,262 +0,0 @@ -import argparse -import time -import math -import torch -import torch.nn as nn -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.utils.data import DataLoader -from model import NextSentenceTask, BertModel, BertEmbedding -from utils import run_demo, run_ddp, wrap_up - - -def process_raw_data(whole_data, args): - processed_data = [] - for _idx in range(len(whole_data)): - item = whole_data[_idx] - if isinstance(item, list): - item = torch.tensor(item) - if len(item) > 1: - # idx to split the text into two sentencd - split_idx = torch.randint(1, len(item), size=(1, 1)).item() - # Index 2 means same sentence label. 
Initial true int(1) - processed_data.append([item[:split_idx], item[split_idx:], 1]) - # Random shuffle data to have args.frac_ns next sentence set up - shuffle_idx1 = torch.randperm(len(processed_data)) - shuffle_idx2 = torch.randperm(len(processed_data)) - num_shuffle = int(len(processed_data) * args.frac_ns) - shuffle_zip = list(zip(shuffle_idx1, shuffle_idx2))[:num_shuffle] - for (i, j) in shuffle_zip: - processed_data[i][1] = processed_data[j][0] - processed_data[i][2] = int(0) # Switch same sentence label to false 0 - return processed_data - - -def collate_batch(batch, args, cls_id, sep_id, pad_id): - # Fix sequence length to args.bptt with padding or trim - seq_list = [] - tok_type = [] - same_sentence_labels = [] - for item in batch: - qa_item = torch.cat([item[0], torch.tensor([sep_id]).long(), item[1], torch.tensor([sep_id]).long()]) - if qa_item.size(0) > args.bptt: - qa_item = qa_item[:args.bptt] - elif qa_item.size(0) < args.bptt: - qa_item = torch.cat((qa_item, - torch.tensor([pad_id] * (args.bptt - - qa_item.size(0))))) - seq_list.append(qa_item) - _tok_tp = torch.ones((qa_item.size(0))) - _idx = min(len(item[0]) + 1, args.bptt) - _tok_tp[:_idx] = 0.0 - tok_type.append(_tok_tp) - same_sentence_labels.append(item[2]) - seq_input = torch.stack(seq_list).long().t().contiguous() - seq_input = torch.cat((torch.tensor([[cls_id] * seq_input.size(1)]).long(), seq_input)) - tok_type = torch.stack(tok_type).long().t().contiguous() - tok_type = torch.cat((torch.tensor([[0] * tok_type.size(1)]).long(), tok_type)) - return seq_input, tok_type, torch.tensor(same_sentence_labels).long().contiguous() - - -def evaluate(data_source, model, device, criterion, cls_id, sep_id, pad_id, args): - model.eval() - total_loss = 0. - batch_size = args.batch_size - dataloader = DataLoader(data_source, batch_size=batch_size, shuffle=True, - collate_fn=lambda b: collate_batch(b, args, cls_id, sep_id, pad_id)) - with torch.no_grad(): - for idx, (seq_input, tok_type, target_ns_labels) in enumerate(dataloader): - if args.parallel == 'DDP': - seq_input = seq_input.to(device[0]) - tok_type = tok_type.to(device[0]) - target_ns_labels = target_ns_labels.to(device[0]) - else: - seq_input = seq_input.to(device) - tok_type = tok_type.to(device) - target_ns_labels = target_ns_labels.to(device) - seq_input = seq_input.transpose(0, 1) # Wrap up by DDP or DataParallel - ns_labels = model(seq_input, token_type_input=tok_type) - loss = criterion(ns_labels, target_ns_labels) - total_loss += loss.item() - return total_loss / (len(data_source) // batch_size) - - -def train(train_dataset, model, train_loss_log, device, optimizer, criterion, - epoch, scheduler, cls_id, sep_id, pad_id, args, rank=None): - model.train() - total_loss = 0. 
- start_time = time.time() - batch_size = args.batch_size - dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, - collate_fn=lambda b: collate_batch(b, args, cls_id, sep_id, pad_id)) - train_loss_log.append(0.0) - for idx, (seq_input, tok_type, target_ns_labels) in enumerate(dataloader): - if args.parallel == 'DDP': - seq_input = seq_input.to(device[0]) - tok_type = tok_type.to(device[0]) - target_ns_labels = target_ns_labels.to(device[0]) - else: - seq_input = seq_input.to(device) - tok_type = tok_type.to(device) - target_ns_labels = target_ns_labels.to(device) - optimizer.zero_grad() - seq_input = seq_input.transpose(0, 1) # Wrap up by DDP or DataParallel - ns_labels = model(seq_input, token_type_input=tok_type) - loss = criterion(ns_labels, target_ns_labels) - loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) - optimizer.step() - total_loss += loss.item() - if idx % args.log_interval == 0 and idx > 0: - cur_loss = total_loss / args.log_interval - elapsed = time.time() - start_time - if (rank is None) or rank == 0: - train_loss_log[-1] = cur_loss - print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ' - 'ms/batch {:5.2f} | ' - 'loss {:8.5f} | ppl {:5.2f}'.format(epoch, idx, - len(train_dataset) // batch_size, - scheduler.get_last_lr()[0], - elapsed * 1000 / args.log_interval, - cur_loss, math.exp(cur_loss))) - total_loss = 0 - start_time = time.time() - - -def run_main(args, rank=None): - # Set the random seed manually for reproducibility. - torch.manual_seed(args.seed) - if args.parallel == 'DDP': - n = torch.cuda.device_count() // args.world_size - device = list(range(rank * n, (rank + 1) * n)) - else: - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - vocab = torch.load(args.save_vocab) - cls_id = vocab.stoi[''] - pad_id = vocab.stoi[''] - sep_id = vocab.stoi[''] - - if args.dataset == 'WikiText103': - from torchtext.experimental.datasets import WikiText103 - train_dataset, valid_dataset, test_dataset = WikiText103(vocab=vocab) - elif args.dataset == 'BookCorpus': - from data import BookCorpus - train_dataset, valid_dataset, test_dataset = BookCorpus(vocab, min_sentence_len=60) - - if rank is not None: - chunk_len = len(train_dataset.data) // args.world_size - train_dataset.data = train_dataset.data[(rank * chunk_len):((rank + 1) * chunk_len)] - - if args.checkpoint != 'None': - model = torch.load(args.checkpoint) - else: - embed_layer = BertEmbedding(len(vocab), args.emsize) - pretrained_bert = BertModel(len(vocab), args.emsize, args.nhead, args.nhid, args.nlayers, embed_layer, args.dropout) - pretrained_bert.load_state_dict(torch.load(args.bert_model)) - model = NextSentenceTask(pretrained_bert) - - if args.parallel == 'DDP': - model = model.to(device[0]) - model = DDP(model, device_ids=device) - else: - model = model.to(device) - criterion = nn.CrossEntropyLoss() - optimizer = torch.optim.SGD(model.parameters(), lr=args.lr) - scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1) - best_val_loss = None - train_loss_log, val_loss_log = [], [] - - for epoch in range(1, args.epochs + 1): - epoch_start_time = time.time() - train(process_raw_data(train_dataset, args), model, train_loss_log, device, optimizer, - criterion, epoch, scheduler, cls_id, sep_id, pad_id, args, rank) - val_loss = evaluate(process_raw_data(valid_dataset, args), model, device, criterion, - cls_id, sep_id, pad_id, args) - val_loss_log.append(val_loss) - - if (rank is None) or (rank == 0): - print('-' * 89) - print('| 
end of epoch {:3d} | time: {:5.2f}s ' - '| valid loss {:8.5f} | '.format(epoch, - (time.time() - epoch_start_time), - val_loss)) - print('-' * 89) - if not best_val_loss or val_loss < best_val_loss: - if rank is None: - with open(args.save, 'wb') as f: - torch.save(model, f) - elif rank == 0: - with open(args.save, 'wb') as f: - torch.save(model.state_dict(), f) - best_val_loss = val_loss - else: - scheduler.step() - if args.parallel == 'DDP': - rank0_devices = [x - rank * len(device) for x in device] - device_pairs = zip(rank0_devices, device) - map_location = {'cuda:%d' % x: 'cuda:%d' % y for x, y in device_pairs} - model.load_state_dict(torch.load(args.save, map_location=map_location)) - test_loss = evaluate(process_raw_data(test_dataset, args), model, device, criterion, - cls_id, sep_id, pad_id, args) - if rank == 0: - wrap_up(train_loss_log, val_loss_log, test_loss, args, model.module, 'ns_loss.txt', 'ns_model.pt') - else: - with open(args.save, 'rb') as f: - model = torch.load(f) - - test_loss = evaluate(process_raw_data(test_dataset, args), model, device, criterion, - cls_id, sep_id, pad_id, args) - wrap_up(train_loss_log, val_loss_log, test_loss, args, model, 'ns_loss.txt', 'ns_model.pt') - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Question-Answer fine-tuning task') - parser.add_argument('--dataset', type=str, default='WikiText103', - help='dataset used for next sentence task') - parser.add_argument('--lr', type=float, default=0.25, - help='initial learning rate') - parser.add_argument('--clip', type=float, default=0.1, - help='gradient clipping') - parser.add_argument('--epochs', type=int, default=5, - help='upper epoch limit') - parser.add_argument('--batch_size', type=int, default=24, metavar='N', - help='batch size') - parser.add_argument('--bptt', type=int, default=128, - help='max. sequence length for the next-sentence pair') - parser.add_argument('--min_sentence_len', type=int, default=60, - help='min. 
sequence length for the raw text tokens') - parser.add_argument('--seed', type=int, default=312216194, - help='random seed') - parser.add_argument('--cuda', action='store_true', - help='use CUDA') - parser.add_argument('--log-interval', type=int, default=600, metavar='N', - help='report interval') - parser.add_argument('--checkpoint', type=str, default='None', - help='path to load the checkpoint') - parser.add_argument('--save', type=str, default='ns_bert.pt', - help='path to save the bert model') - parser.add_argument('--save-vocab', type=str, default='torchtext_bert_vocab.pt', - help='path to save the vocab') - parser.add_argument('--bert-model', type=str, default='mlm_bert.pt', - help='path to save the pretrained bert') - parser.add_argument('--frac_ns', type=float, default=0.5, - help='fraction of not next sentence') - parser.add_argument('--parallel', type=str, default='None', - help='Use DataParallel/DDP to train model') - parser.add_argument('--world_size', type=int, default=8, - help='the world size to initiate DPP') - parser.add_argument('--emsize', type=int, default=768, - help='size of word embeddings') - parser.add_argument('--nhid', type=int, default=3072, - help='number of hidden units per layer') - parser.add_argument('--nlayers', type=int, default=12, - help='number of layers') - parser.add_argument('--nhead', type=int, default=12, - help='the number of heads in the encoder/decoder of the transformer model') - parser.add_argument('--dropout', type=float, default=0.2, - help='dropout applied to layers (0 = no dropout)') - args = parser.parse_args() - - if args.parallel == 'DDP': - run_demo(run_ddp, run_main, args) - else: - run_main(args) diff --git a/examples/BERT/qa_task.py b/examples/BERT/qa_task.py deleted file mode 100644 index c11d4561c0..0000000000 --- a/examples/BERT/qa_task.py +++ /dev/null @@ -1,214 +0,0 @@ -import argparse -import time -import math -import torch -import torch.nn as nn -from torch.utils.data import DataLoader -import torchtext -from torchtext.experimental.datasets import SQuAD1 -from model import QuestionAnswerTask -from metrics import compute_qa_exact, compute_qa_f1 -from utils import print_loss_log -from model import BertModel, BertEmbedding - - -def process_raw_data(data): - _data = [] - for (context, question, answers, ans_pos) in data: - right_length = True - for _idx in range(len(ans_pos)): - if ans_pos[_idx][1] + question.size(0) + 2 >= args.bptt: - right_length = False - if right_length: - _data.append((context, question, answers, ans_pos)) - return _data - - -def collate_batch(batch): - seq_list = [] - ans_pos_list = [] - tok_type = [] - for (context, question, answers, ans_pos) in batch: - qa_item = torch.cat((torch.tensor([cls_id]), question, torch.tensor([sep_id]), - context, torch.tensor([sep_id]))) - if qa_item.size(0) > args.bptt: - qa_item = qa_item[:args.bptt] - elif qa_item.size(0) < args.bptt: - qa_item = torch.cat((qa_item, - torch.tensor([pad_id] * (args.bptt - - qa_item.size(0))))) - seq_list.append(qa_item) - pos_list = [pos + question.size(0) + 2 for pos in ans_pos] # 1 for sep and 1 for cls - ans_pos_list.append(pos_list) - tok_type.append(torch.cat((torch.zeros((question.size(0) + 2)), - torch.ones((args.bptt - - question.size(0) - 2))))) - _ans_pos_list = [] - for pos in zip(*ans_pos_list): - _ans_pos_list.append(torch.stack(list(pos))) - return torch.stack(seq_list).long().t().contiguous().to(device), \ - _ans_pos_list, \ - torch.stack(tok_type).long().t().contiguous().to(device) - - -def evaluate(data_source, vocab): - 
model.eval() - total_loss = 0. - batch_size = args.batch_size - dataloader = DataLoader(data_source, batch_size=batch_size, shuffle=True, - collate_fn=collate_batch) - ans_pred_tokens_samples = [] - with torch.no_grad(): - for idx, (seq_input, ans_pos_list, tok_type) in enumerate(dataloader): - start_pos, end_pos = model(seq_input, token_type_input=tok_type) - target_start_pos, target_end_pos = [], [] - for item in ans_pos_list: - _target_start_pos, _target_end_pos = item.to(device).split(1, dim=-1) - target_start_pos.append(_target_start_pos.squeeze(-1)) - target_end_pos.append(_target_end_pos.squeeze(-1)) - loss = (criterion(start_pos, target_start_pos[0]) - + criterion(end_pos, target_end_pos[0])) / 2 - total_loss += loss.item() - start_pos = nn.functional.softmax(start_pos, dim=1).argmax(1) - end_pos = nn.functional.softmax(end_pos, dim=1).argmax(1) - seq_input = seq_input.transpose(0, 1) # convert from (S, N) to (N, S) - for num in range(0, seq_input.size(0)): - if int(start_pos[num]) > int(end_pos[num]): - continue # start pos is in front of end pos - ans_tokens = [] - for _idx in range(len(target_end_pos)): - ans_tokens.append([vocab.itos[int(seq_input[num][i])] - for i in range(target_start_pos[_idx][num], - target_end_pos[_idx][num] + 1)]) - pred_tokens = [vocab.itos[int(seq_input[num][i])] - for i in range(start_pos[num], - end_pos[num] + 1)] - ans_pred_tokens_samples.append((ans_tokens, pred_tokens)) - return total_loss / (len(data_source) // batch_size), \ - compute_qa_exact(ans_pred_tokens_samples), \ - compute_qa_f1(ans_pred_tokens_samples) - - -def train(): - model.train() - total_loss = 0. - start_time = time.time() - batch_size = args.batch_size - dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, - collate_fn=collate_batch) - train_loss_log.append(0.0) - for idx, (seq_input, ans_pos, tok_type) in enumerate(dataloader): - optimizer.zero_grad() - start_pos, end_pos = model(seq_input, token_type_input=tok_type) - target_start_pos, target_end_pos = ans_pos[0].to(device).split(1, dim=-1) - target_start_pos = target_start_pos.squeeze(-1) - target_end_pos = target_end_pos.squeeze(-1) - loss = (criterion(start_pos, target_start_pos) + criterion(end_pos, target_end_pos)) / 2 - loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) - optimizer.step() - total_loss += loss.item() - if idx % args.log_interval == 0 and idx > 0: - cur_loss = total_loss / args.log_interval - train_loss_log[-1] = cur_loss - elapsed = time.time() - start_time - print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ' - 'ms/batch {:5.2f} | ' - 'loss {:5.2f} | ppl {:8.2f}'.format(epoch, idx, - len(train_dataset) // batch_size, - scheduler.get_last_lr()[0], - elapsed * 1000 / args.log_interval, - cur_loss, math.exp(cur_loss))) - total_loss = 0 - start_time = time.time() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Question-Answer fine-tuning task') - parser.add_argument('--lr', type=float, default=5.0, - help='initial learning rate') - parser.add_argument('--clip', type=float, default=0.1, - help='gradient clipping') - parser.add_argument('--epochs', type=int, default=2, - help='upper epoch limit') - parser.add_argument('--batch_size', type=int, default=72, metavar='N', - help='batch size') - parser.add_argument('--bptt', type=int, default=128, - help='max. 
sequence length for context + question') - parser.add_argument('--seed', type=int, default=21192391, - help='random seed') - parser.add_argument('--log-interval', type=int, default=200, metavar='N', - help='report interval') - parser.add_argument('--save', type=str, default='qa_model.pt', - help='path to save the final bert model') - parser.add_argument('--save-vocab', type=str, default='torchtext_bert_vocab.pt', - help='path to save the vocab') - parser.add_argument('--bert-model', type=str, default='ns_bert.pt', - help='path to save the pretrained bert') - parser.add_argument('--emsize', type=int, default=768, - help='size of word embeddings') - parser.add_argument('--nhid', type=int, default=3072, - help='number of hidden units per layer') - parser.add_argument('--nlayers', type=int, default=12, - help='number of layers') - parser.add_argument('--nhead', type=int, default=12, - help='the number of heads in the encoder/decoder of the transformer model') - parser.add_argument('--dropout', type=float, default=0.2, - help='dropout applied to layers (0 = no dropout)') - args = parser.parse_args() - torch.manual_seed(args.seed) - - try: - vocab = torch.load(args.save_vocab) - except: - train_dataset, dev_dataset = SQuAD1() - old_vocab = train_dataset.vocab - vocab = torchtext.legacy.vocab.Vocab(counter=old_vocab.freqs, - specials=['', '', '']) - with open(args.save_vocab, 'wb') as f: - torch.save(vocab, f) - pad_id = vocab.stoi[''] - sep_id = vocab.stoi[''] - cls_id = vocab.stoi[''] - train_dataset, dev_dataset = SQuAD1(vocab=vocab) - train_dataset = process_raw_data(train_dataset) - dev_dataset = process_raw_data(dev_dataset) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - embed_layer = BertEmbedding(len(vocab), args.emsize) - pretrained_bert = BertModel(len(vocab), args.emsize, args.nhead, args.nhid, args.nlayers, embed_layer, args.dropout) - pretrained_bert.load_state_dict(torch.load(args.bert_model)) - model = QuestionAnswerTask(pretrained_bert).to(device) - criterion = nn.CrossEntropyLoss() - optimizer = torch.optim.SGD(model.parameters(), lr=args.lr) - scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1) - best_f1 = None - train_loss_log, val_loss_log = [], [] - - for epoch in range(1, args.epochs + 1): - epoch_start_time = time.time() - train() - val_loss, val_exact, val_f1 = evaluate(dev_dataset, vocab) - val_loss_log.append(val_loss) - print('-' * 89) - print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' - 'exact {:8.3f}% | ' - 'f1 {:8.3f}%'.format(epoch, (time.time() - epoch_start_time), - val_loss, val_exact, val_f1)) - print('-' * 89) - if best_f1 is None or val_f1 > best_f1: - with open(args.save, 'wb') as f: - torch.save(model, f) - best_f1 = val_f1 - else: - scheduler.step() - - with open(args.save, 'rb') as f: - model = torch.load(f) - test_loss, test_exact, test_f1 = evaluate(dev_dataset, vocab) - print('=' * 89) - print('| End of training | test loss {:5.2f} | exact {:8.3f}% | f1 {:8.3f}%'.format( - test_loss, test_exact, test_f1)) - print('=' * 89) - print_loss_log('qa_loss.txt', train_loss_log, val_loss_log, test_loss) - with open(args.save, 'wb') as f: - torch.save(model, f) diff --git a/examples/BERT/utils.py b/examples/BERT/utils.py deleted file mode 100644 index 94cf371663..0000000000 --- a/examples/BERT/utils.py +++ /dev/null @@ -1,58 +0,0 @@ -import torch -import torch.distributed as dist -import os -import torch.multiprocessing as mp -import math - - -def setup(rank, world_size, seed): - 
os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '12355' - # initialize the process group - dist.init_process_group("nccl", rank=rank, world_size=world_size) - - # Explicitly setting seed to make sure that models created in two processes - # start from same random weights and biases. - torch.manual_seed(seed) - - -def cleanup(): - dist.destroy_process_group() - - -def run_demo(demo_fn, main_fn, args): - mp.spawn(demo_fn, - args=(main_fn, args,), - nprocs=args.world_size, - join=True) - - -def run_ddp(rank, main_fn, args): - setup(rank, args.world_size, args.seed) - main_fn(args, rank) - cleanup() - - -def print_loss_log(file_name, train_loss, val_loss, test_loss, args=None): - with open(file_name, 'w') as f: - if args: - for item in args.__dict__: - f.write(item + ': ' + str(args.__dict__[item]) + '\n') - for idx in range(len(train_loss)): - f.write('epoch {:3d} | train loss {:8.5f}'.format(idx + 1, - train_loss[idx]) + '\n') - for idx in range(len(val_loss)): - f.write('epoch {:3d} | val loss {:8.5f}'.format(idx + 1, - val_loss[idx]) + '\n') - f.write('test loss {:8.5f}'.format(test_loss) + '\n') - - -def wrap_up(train_loss_log, val_loss_log, test_loss, args, model, ns_loss_log, model_filename): - print('=' * 89) - print('| End of training | test loss {:8.5f} | test ppl {:8.5f}'.format(test_loss, math.exp(test_loss))) - print('=' * 89) - print_loss_log(ns_loss_log, train_loss_log, val_loss_log, test_loss) - with open(args.save, 'wb') as f: - torch.save(model.bert_model.state_dict(), f) - with open(model_filename, 'wb') as f: - torch.save(model.state_dict(), f) diff --git a/packaging/build_conda.sh b/packaging/build_conda.sh index 825d53a626..bac0ad7bef 100755 --- a/packaging/build_conda.sh +++ b/packaging/build_conda.sh @@ -6,7 +6,7 @@ script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" export BUILD_TYPE="conda" export NO_CUDA_PACKAGE=1 -setup_env 0.11.0 +setup_env 0.12.0 export SOURCE_ROOT_DIR="$PWD" setup_conda_pytorch_constraint setup_visual_studio_constraint diff --git a/packaging/build_wheel.sh b/packaging/build_wheel.sh index 8cd7391b05..34ae8ef996 100755 --- a/packaging/build_wheel.sh +++ b/packaging/build_wheel.sh @@ -6,7 +6,7 @@ script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" export BUILD_TYPE="wheel" export NO_CUDA_PACKAGE=1 -setup_env 0.11.0 +setup_env 0.12.0 setup_wheel_python pip_install numpy future setup_pip_pytorch_version diff --git a/test/asset/raw_datasets.jsonl b/test/asset/raw_datasets.jsonl index c27ec73f44..eaf70d4ebf 100644 --- a/test/asset/raw_datasets.jsonl +++ b/test/asset/raw_datasets.jsonl @@ -30,9 +30,6 @@ {"dataset_name": "IWSLT2017", "split": "train", "NUM_LINES": 206112, "MD5": "aca701032b1c4411afc4d9fa367796ba", "URL": "https://drive.google.com/u/0/uc?id=12ycYSzLIG253AFN35Y6qoyf9wtkOjakp", "first_line": "c75166d2ffde3978586af8a8ebdf6450"} {"dataset_name": "IWSLT2017", "split": "valid", "NUM_LINES": 888, "MD5": "aca701032b1c4411afc4d9fa367796ba", "URL": "https://drive.google.com/u/0/uc?id=12ycYSzLIG253AFN35Y6qoyf9wtkOjakp", "first_line": "c43021713268b2efc08a255c191f2c74"} {"dataset_name": "IWSLT2017", "split": "test", "NUM_LINES": 1568, "MD5": "aca701032b1c4411afc4d9fa367796ba", "URL": "https://drive.google.com/u/0/uc?id=12ycYSzLIG253AFN35Y6qoyf9wtkOjakp", "first_line": "cfff6f23c564bc4cb372bee4987f6707"} -{"dataset_name": "WMT14", "split": "train", "NUM_LINES": 4500966, "MD5": "874ab6bbfe9c21ec987ed1b9347f95ec", "URL": 
"https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8", "first_line": "27a5871c90db257250806f52ba6aff0c"} -{"dataset_name": "WMT14", "split": "valid", "NUM_LINES": 3000, "MD5": "874ab6bbfe9c21ec987ed1b9347f95ec", "URL": "https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8", "first_line": "a302bd241b4c3138a400e38a846f93b0"} -{"dataset_name": "WMT14", "split": "test", "NUM_LINES": 3003, "MD5": "874ab6bbfe9c21ec987ed1b9347f95ec", "URL": "https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8", "first_line": "140245f6a92f95225150f717e2d7a1a7"} {"dataset_name": "WikiText2", "split": "train", "NUM_LINES": 36718, "MD5": "542ccefacc6c27f945fb54453812b3cd", "URL": "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip", "first_line": "c3e189c0ef8590f093c38b41bdba5239"} {"dataset_name": "WikiText2", "split": "valid", "NUM_LINES": 3760, "MD5": "542ccefacc6c27f945fb54453812b3cd", "URL": "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip", "first_line": "c3e189c0ef8590f093c38b41bdba5239"} {"dataset_name": "WikiText2", "split": "test", "NUM_LINES": 4358, "MD5": "542ccefacc6c27f945fb54453812b3cd", "URL": "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip", "first_line": "c3e189c0ef8590f093c38b41bdba5239"} diff --git a/test/common/case_utils.py b/test/common/case_utils.py new file mode 100644 index 0000000000..03eec2627f --- /dev/null +++ b/test/common/case_utils.py @@ -0,0 +1,7 @@ +import unittest +from torchtext._internal.module_utils import is_module_available + + +def skipIfNoModule(module, display_name=None): + display_name = display_name or module + return unittest.skipIf(not is_module_available(module), f'"{display_name}" is not available') diff --git a/test/experimental/test_builtin_datasets.py b/test/experimental/test_builtin_datasets.py index 8670bd9c2f..1eb7d635ef 100644 --- a/test/experimental/test_builtin_datasets.py +++ b/test/experimental/test_builtin_datasets.py @@ -3,6 +3,7 @@ import torchtext import json import hashlib +import unittest from parameterized import parameterized from ..common.torchtext_test_case import TorchtextTestCase from ..common.parameterized_utils import load_params @@ -26,6 +27,7 @@ def setUpClass(cls): @parameterized.expand( load_params('raw_datasets.jsonl'), name_func=_raw_text_custom_name_func) + @unittest.skip("Skipping test due to invalid URL. Enable it back once WMT14 is fixed") def test_raw_text_name_property(self, info): dataset_name = info['dataset_name'] split = info['split'] @@ -39,6 +41,7 @@ def test_raw_text_name_property(self, info): @parameterized.expand( load_params('raw_datasets.jsonl'), name_func=_raw_text_custom_name_func) + @unittest.skip("Skipping test due to invalid URL. 
Enable it back once WMT14 is fixed") def test_raw_text_classification(self, info): dataset_name = info['dataset_name'] split = info['split'] diff --git a/test/experimental/test_datasets.py b/test/experimental/test_datasets.py new file mode 100644 index 0000000000..2a9ff700ff --- /dev/null +++ b/test/experimental/test_datasets.py @@ -0,0 +1,34 @@ +import hashlib +import json + +from torchtext.experimental.datasets import sst2 + +from ..common.case_utils import skipIfNoModule +from ..common.torchtext_test_case import TorchtextTestCase + + +class TestDataset(TorchtextTestCase): + @skipIfNoModule("torchdata") + def test_sst2_dataset(self): + split = ("train", "dev", "test") + train_dp, dev_dp, test_dp = sst2.SST2(split=split) + + # verify hashes of first line in dataset + self.assertEqual( + hashlib.md5( + json.dumps(next(iter(train_dp)), sort_keys=True).encode("utf-8") + ).hexdigest(), + sst2._FIRST_LINE_MD5["train"], + ) + self.assertEqual( + hashlib.md5( + json.dumps(next(iter(dev_dp)), sort_keys=True).encode("utf-8") + ).hexdigest(), + sst2._FIRST_LINE_MD5["dev"], + ) + self.assertEqual( + hashlib.md5( + json.dumps(next(iter(test_dp)), sort_keys=True).encode("utf-8") + ).hexdigest(), + sst2._FIRST_LINE_MD5["test"], + ) diff --git a/torchtext/_extension.py b/torchtext/_extension.py index 77dd101bf3..4400582113 100644 --- a/torchtext/_extension.py +++ b/torchtext/_extension.py @@ -1,46 +1,18 @@ -import importlib - - -def is_module_available(*modules: str) -> bool: - r"""Returns if a top-level module with :attr:`name` exists *without** - importing it. This is generally safer than try-catch block around a - `import X`. It avoids third party libraries breaking assumptions of some of - our tests, e.g., setting multiprocessing start method when imported - (see librosa/#747, torchvision/#544). - """ - return all(importlib.util.find_spec(m) is not None for m in modules) +def _init_extension(): + import os + import importlib + import torch + # load the custom_op_library and register the custom ops + lib_dir = os.path.dirname(__file__) + loader_details = ( + importlib.machinery.ExtensionFileLoader, + importlib.machinery.EXTENSION_SUFFIXES + ) -def _init_extension(): - if is_module_available("torchtext._torchtext"): - # Note this import has two purposes - # 1. Make _torchtext accessible by the other modules (regular import) - # 2. Register torchtext's custom ops bound via TorchScript - # - # For 2, normally function calls `torch.ops.load_library` and `torch.classes.load_library` - # are used. However, in our cases, this is inconvenient and unnecessary. - # - # - Why inconvenient? - # When torchtext is deployed with `pex` format, all the files are deployed as a single zip - # file, and the extension module is not present as a file with full path. Therefore it is not - # possible to pass the path to library to `torch.[ops|classes].load_library` functions. - # - # - Why unnecessary? - # When torchtext extension module (C++ module) is available, it is assumed that - # the extension contains both TorchScript-based binding and PyBind11-based binding.* - # Under this assumption, simply performing `from torchtext import _torchtext' will load the - # library which contains TorchScript-based binding as well, and the functions/classes bound - # via TorchScript become accessible under `torch.ops` and `torch.classes`. 
- # - # *Note that this holds true even when these two bindings are split into two library files and - # the library that contains PyBind11-based binding (`_torchtext.so` in the following diagram) - # depends on the other one (`libtorchtext.so`), because when the process tries to load - # `_torchtext.so` it detects undefined symbols from `libtorchtext.so` and will automatically - # loads `libtorchtext.so`. (given that the library is found in a search path) - # - # [libtorchtext.so] <- [_torchtext.so] - # - # - from torchtext import _torchtext # noqa - else: - raise ImportError("torchtext C++ extension is not available.") + extfinder = importlib.machinery.FileFinder(lib_dir, loader_details) + ext_specs = extfinder.find_spec("_torchtext") + if ext_specs is None: + raise ImportError("torchtext C++ Extension is not found.") + torch.ops.load_library(ext_specs.origin) + torch.classes.load_library(ext_specs.origin) diff --git a/torchtext/_internal/__init__.py b/torchtext/_internal/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/torchtext/_internal/module_utils.py b/torchtext/_internal/module_utils.py new file mode 100644 index 0000000000..33ac388bc4 --- /dev/null +++ b/torchtext/_internal/module_utils.py @@ -0,0 +1,11 @@ +import importlib.util + + +def is_module_available(*modules: str) -> bool: + r"""Returns if a top-level module with :attr:`name` exists *without** + importing it. This is generally safer than try-catch block around a + `import X`. It avoids third party libraries breaking assumptions of some of + our tests, e.g., setting multiprocessing start method when imported + (see librosa/#747, torchvision/#544). + """ + return all(importlib.util.find_spec(m) is not None for m in modules) diff --git a/torchtext/data/datasets_utils.py b/torchtext/data/datasets_utils.py index 8cbe63105d..571b43c479 100644 --- a/torchtext/data/datasets_utils.py +++ b/torchtext/data/datasets_utils.py @@ -247,7 +247,7 @@ def decorator(func): len(argspec.kwonlyargs) == 0 and len(argspec.annotations) == 0 ): - raise ValueError("Internal Error: Given function {} did not adhere to standard signature.".format(func)) + raise ValueError("Internal Error: Given function {} did not adhere to standard signature.".format(fn)) @functools.wraps(func) def wrapper(root=os.path.expanduser('~/.torchtext/cache'), *args, **kwargs): diff --git a/torchtext/experimental/datasets/__init__.py b/torchtext/experimental/datasets/__init__.py index bf2cbaa924..81bc90a801 100644 --- a/torchtext/experimental/datasets/__init__.py +++ b/torchtext/experimental/datasets/__init__.py @@ -1,3 +1,4 @@ from . import raw +from . import sst2 -__all__ = ['raw'] +__all__ = ["raw", "sst2"] diff --git a/torchtext/experimental/datasets/sst2.py b/torchtext/experimental/datasets/sst2.py new file mode 100644 index 0000000000..85b892eb69 --- /dev/null +++ b/torchtext/experimental/datasets/sst2.py @@ -0,0 +1,90 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import logging +import os + +from torchtext._internal.module_utils import is_module_available +from torchtext.data.datasets_utils import ( + _add_docstring_header, + _create_dataset_directory, + _wrap_split_argument, +) + +logger = logging.getLogger(__name__) + +if is_module_available("torchdata"): + from torchdata.datapipes.iter import ( + HttpReader, + IterableWrapper, + ) +else: + logger.warning( + "Package `torchdata` is required to be installed to use this dataset." + "Please refer to https://github.com/pytorch/data for instructions on " + "how to install the package." 
+ ) + + +NUM_LINES = { + "train": 67349, + "dev": 872, + "test": 1821, +} + +MD5 = "9f81648d4199384278b86e315dac217c" +URL = "https://dl.fbaipublicfiles.com/glue/data/SST-2.zip" + +_EXTRACTED_FILES = { + "train": f"{os.sep}".join(["SST-2", "train.tsv"]), + "dev": f"{os.sep}".join(["SST-2", "dev.tsv"]), + "test": f"{os.sep}".join(["SST-2", "test.tsv"]), +} + +_EXTRACTED_FILES_MD5 = { + "train": "da409a0a939379ed32a470bc0f7fe99a", + "dev": "268856b487b2a31a28c0a93daaff7288", + "test": "3230e4efec76488b87877a56ae49675a", +} + +_FIRST_LINE_MD5 = { + "train": "2552b8cecd57b2e022ef23411c688fa8", + "dev": "1b0ffd6aa5f2bf0fd9840a5f6f1a9f07", + "test": "f838c81fe40bfcd7e42e9ffc4dd004f7", +} + +DATASET_NAME = "SST2" + + +@_add_docstring_header(num_lines=NUM_LINES, num_classes=2) +@_create_dataset_directory(dataset_name=DATASET_NAME) +@_wrap_split_argument(("train", "dev", "test")) +def SST2(root, split): + return SST2Dataset(root, split).get_datapipe() + + +class SST2Dataset: + """The SST2 dataset uses torchdata datapipes end-2-end. + To avoid download at every epoch, we cache the data on-disk + We do sanity check on dowloaded and extracted data + """ + + def __init__(self, root, split): + self.root = root + self.split = split + + def get_datapipe(self): + # cache data on-disk + cache_dp = IterableWrapper([URL]).on_disk_cache( + HttpReader, + op_map=lambda x: (x[0], x[1].read()), + filepath_fn=lambda x: os.path.join(self.root, os.path.basename(x)), + ) + + # extract data from zip + extracted_files = cache_dp.read_from_zip() + + # Parse CSV file and yield data samples + return ( + extracted_files.filter(lambda x: self.split in x[0]) + .parse_csv(skip_lines=1, delimiter="\t") + .map(lambda x: (x[0], x[1])) + ) diff --git a/torchtext/vocab/vectors.py b/torchtext/vocab/vectors.py index bec350b9a1..e04eb4ee33 100644 --- a/torchtext/vocab/vectors.py +++ b/torchtext/vocab/vectors.py @@ -187,7 +187,7 @@ def get_vecs_by_tokens(self, tokens, lower_case_backup=False): Examples: >>> examples = ['chip', 'baby', 'Beautiful'] >>> vec = text.vocab.GloVe(name='6B', dim=50) - >>> ret = vec.get_vecs_by_tokens(tokens, lower_case_backup=True) + >>> ret = vec.get_vecs_by_tokens(examples, lower_case_backup=True) """ to_reduce = False diff --git a/version.txt b/version.txt index d22e31d207..3db2940af4 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.11.0a0 +0.12.0a0
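
For reference, a minimal usage sketch (not part of the patch itself) of the new experimental SST2 dataset added in torchtext/experimental/datasets/sst2.py. It assumes `torchdata` is installed, as required by the guarded import in that module, and mirrors the call made in test/experimental/test_datasets.py.

    # Minimal sketch, assuming `torchdata` is installed; not part of the patch.
    from torchtext.experimental.datasets import sst2

    # The decorated SST2() entry point returns one datapipe per requested split.
    train_dp, dev_dp, test_dp = sst2.SST2(split=("train", "dev", "test"))

    # Each datapipe streams rows parsed from the cached and extracted SST-2 TSV
    # files; for the train/dev splits the two fields are the sentence and its label.
    first_sample = next(iter(train_dp))
    print(first_sample)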
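
The first-line checksums verified in test/experimental/test_datasets.py can be regenerated with a short helper along the following lines (a hypothetical snippet, not included in the patch); it hashes the first sample of each split exactly the way the test does.

    # Hypothetical helper mirroring the hashing scheme used by the new test;
    # handy for refreshing _FIRST_LINE_MD5 if the upstream data ever changes.
    import hashlib
    import json

    from torchtext.experimental.datasets import sst2


    def first_line_md5(datapipe):
        # Serialize the first sample deterministically, then hash it.
        sample = next(iter(datapipe))
        return hashlib.md5(json.dumps(sample, sort_keys=True).encode("utf-8")).hexdigest()


    train_dp, dev_dp, test_dp = sst2.SST2(split=("train", "dev", "test"))
    for name, dp in (("train", train_dp), ("dev", dev_dp), ("test", test_dp)):
        print(name, first_line_md5(dp))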
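
Because `torchdata` remains an optional dependency, the helpers introduced in torchtext/_internal/module_utils.py and test/common/case_utils.py can be used to guard dependent code paths and tests. A small illustrative sketch follows; the test class name is hypothetical and only `is_module_available` comes from this patch.

    # Illustrative only: skip work when the optional `torchdata` package is absent,
    # using the is_module_available() helper introduced in this patch.
    import unittest

    from torchtext._internal.module_utils import is_module_available


    @unittest.skipIf(not is_module_available("torchdata"), '"torchdata" is not available')
    class TorchdataDependentTest(unittest.TestCase):  # hypothetical test case
        def test_torchdata_importable(self):
            import torchdata  # noqa: F401  -- safe: availability was checked above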