From fa565ee4b6af286e4e89bf1c641428ead8ced421 Mon Sep 17 00:00:00 2001
From: nayef211 <n63ahmed@edu.uwaterloo.ca>
Date: Wed, 13 Oct 2021 16:11:30 -0700
Subject: [PATCH 01/14] added new SST2 dataset class based on sst2 functional
 dataset in torchdata

---
 test/experimental/test_datasets.py          | 14 ++++
 third_party/sentencepiece                   |  2 +-
 torchtext/experimental/datasets/__init__.py |  3 +-
 torchtext/experimental/datasets/sst2.py     | 87 +++++++++++++++++++++
 4 files changed, 104 insertions(+), 2 deletions(-)
 create mode 100644 test/experimental/test_datasets.py
 create mode 100644 torchtext/experimental/datasets/sst2.py

diff --git a/test/experimental/test_datasets.py b/test/experimental/test_datasets.py
new file mode 100644
index 0000000000..5d752c9062
--- /dev/null
+++ b/test/experimental/test_datasets.py
@@ -0,0 +1,14 @@
+from torchtext.experimental.datasets import sst2
+
+from ..common.torchtext_test_case import TorchtextTestCase
+
+
+class TestDataset(TorchtextTestCase):
+    def test_sst2_dataset(self):
+
+        split = ("train", "dev", "test")
+        train_dp, dev_dp, test_dp = sst2.SST2(split=split)
+
+        self.assertEqual(len(list(train_dp)), sst2.NUM_LINES["train"])
+        self.assertEqual(len(list(dev_dp)), sst2.NUM_LINES["dev"])
+        self.assertEqual(len(list(test_dp)), sst2.NUM_LINES["test"])
diff --git a/third_party/sentencepiece b/third_party/sentencepiece
index 0e6dfbf86e..e8a84a16d1 160000
--- a/third_party/sentencepiece
+++ b/third_party/sentencepiece
@@ -1 +1 @@
-Subproject commit 0e6dfbf86e2fa6d86a3d9a8a08a628da71c073e0
+Subproject commit e8a84a16d13e8bf92892a1cd92e4de3b0d0321fd
diff --git a/torchtext/experimental/datasets/__init__.py b/torchtext/experimental/datasets/__init__.py
index bf2cbaa924..81bc90a801 100644
--- a/torchtext/experimental/datasets/__init__.py
+++ b/torchtext/experimental/datasets/__init__.py
@@ -1,3 +1,4 @@
 from . import raw
+from . import sst2
 
-__all__ = ['raw']
+__all__ = ["raw", "sst2"]
diff --git a/torchtext/experimental/datasets/sst2.py b/torchtext/experimental/datasets/sst2.py
new file mode 100644
index 0000000000..c9c96a3e38
--- /dev/null
+++ b/torchtext/experimental/datasets/sst2.py
@@ -0,0 +1,87 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import os
+
+from torchdata.datapipes.iter import (
+    HttpReader,
+    IterableWrapper,
+)
+from torchtext.data.datasets_utils import (
+    _add_docstring_header,
+    _create_dataset_directory,
+    _wrap_split_argument,
+)
+
+
+NUM_LINES = {
+    "train": 67349,
+    "dev": 872,
+    "test": 1821,
+}
+
+MD5 = "9f81648d4199384278b86e315dac217c"
+URL = "https://dl.fbaipublicfiles.com/glue/data/SST-2.zip"
+
+_EXTRACTED_FILES = {
+    "train": f"{os.sep}".join(["SST-2", "train.tsv"]),
+    "dev": f"{os.sep}".join(["SST-2", "dev.tsv"]),
+    "test": f"{os.sep}".join(["SST-2", "test.tsv"]),
+}
+
+_EXTRACTED_FILES_MD5 = {
+    "train": "da409a0a939379ed32a470bc0f7fe99a",
+    "dev": "268856b487b2a31a28c0a93daaff7288",
+    "test": "3230e4efec76488b87877a56ae49675a",
+}
+
+DATASET_NAME = "SST2"
+
+
+@_add_docstring_header(num_lines=NUM_LINES, num_classes=2)
+@_create_dataset_directory(dataset_name=DATASET_NAME)
+@_wrap_split_argument(("train", "dev", "test"))
+def SST2(root, split):
+    return SST2Dataset(root, split).get_datapipes()
+
+
+class SST2Dataset:
+    """The SST2 dataset uses torchdata datapipes end-2-end.
+    To avoid download at every epoch, we cache the data on-disk
+    We do sanity check on dowloaded and extracted data
+    """
+
+    def __init__(self, root, split):
+        self.root = root
+        self.split = split
+
+    def get_datapipes(self):
+        # cache data on-disk
+        cache_dp = IterableWrapper([URL]).on_disk_cache(
+            HttpReader,
+            op_map=lambda x: (x[0], x[1].read()),
+            filepath_fn=lambda x: os.path.join(self.root, os.path.basename(x)),
+        )
+
+        # do sanity check
+        check_cache_dp = cache_dp.check_hash(
+            {os.path.join(self.root, "SST-2.zip"): MD5}, "md5"
+        )
+
+        # extract data from zip
+        extracted_files = check_cache_dp.read_from_zip()
+
+        # Filter extracted files and do sanity check
+        check_extracted_files = extracted_files.filter(
+            lambda x: self.split in x[0]
+        ).check_hash(
+            {
+                os.path.join(
+                    self.root, _EXTRACTED_FILES[self.split]
+                ): _EXTRACTED_FILES_MD5[self.split]
+            },
+            "md5",
+        )
+
+        # Parse CSV file and yield data samples
+        return check_extracted_files.parse_csv(skip_header=True, delimiter="\t").map(
+            lambda x: (x[0], x[1])
+        )

From 3e9551b779f6fbcebd28210b12d145bcb8426529 Mon Sep 17 00:00:00 2001
From: nayef211 <n63ahmed@edu.uwaterloo.ca>
Date: Wed, 13 Oct 2021 16:28:28 -0700
Subject: [PATCH 02/14] Reset sp submodule to previous commit

---
 third_party/sentencepiece | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/sentencepiece b/third_party/sentencepiece
index e8a84a16d1..0e6dfbf86e 160000
--- a/third_party/sentencepiece
+++ b/third_party/sentencepiece
@@ -1 +1 @@
-Subproject commit e8a84a16d13e8bf92892a1cd92e4de3b0d0321fd
+Subproject commit 0e6dfbf86e2fa6d86a3d9a8a08a628da71c073e0

From 1d6d2895cde8e4e4958822c4992946faa30645bc Mon Sep 17 00:00:00 2001
From: nayef211 <n63ahmed@edu.uwaterloo.ca>
Date: Wed, 13 Oct 2021 16:34:13 -0700
Subject: [PATCH 03/14] Updated function name. Added torchdata as a dep

---
 requirements.txt                        | 3 +++
 torchtext/experimental/datasets/sst2.py | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index fd100b8eb3..3f51455663 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,9 @@ tqdm
 # Downloading data and other files
 requests
 
+# Torchdata
+git+https://github.com/pytorch/data.git
+
 # Optional NLP tools
 nltk
 spacy
diff --git a/torchtext/experimental/datasets/sst2.py b/torchtext/experimental/datasets/sst2.py
index c9c96a3e38..1b2062a567 100644
--- a/torchtext/experimental/datasets/sst2.py
+++ b/torchtext/experimental/datasets/sst2.py
@@ -40,7 +40,7 @@
 @_create_dataset_directory(dataset_name=DATASET_NAME)
 @_wrap_split_argument(("train", "dev", "test"))
 def SST2(root, split):
-    return SST2Dataset(root, split).get_datapipes()
+    return SST2Dataset(root, split).get_datapipe()
 
 
 class SST2Dataset:
@@ -53,7 +53,7 @@ def __init__(self, root, split):
         self.root = root
         self.split = split
 
-    def get_datapipes(self):
+    def get_datapipe(self):
         # cache data on-disk
         cache_dp = IterableWrapper([URL]).on_disk_cache(
             HttpReader,

From 38d3d58583ada13b1218c7e69a68398c3d8b4527 Mon Sep 17 00:00:00 2001
From: nayef211 <n63ahmed@edu.uwaterloo.ca>
Date: Wed, 13 Oct 2021 18:46:39 -0700
Subject: [PATCH 04/14] Added torchdata as a dep to setup.py

---
 setup.py                                | 88 ++++++++++++++-----------
 torchtext/experimental/datasets/sst2.py |  2 +-
 2 files changed, 50 insertions(+), 40 deletions(-)

diff --git a/setup.py b/setup.py
index 5db3388053..7a8b12eb51 100644
--- a/setup.py
+++ b/setup.py
@@ -1,45 +1,47 @@
 #!/usr/bin/env python
+import distutils.command.clean
 import io
 import os
 import shutil
 import subprocess
 from pathlib import Path
-import distutils.command.clean
-from setuptools import setup, find_packages
 
 from build_tools import setup_helpers
+from setuptools import setup, find_packages
 
 ROOT_DIR = Path(__file__).parent.resolve()
 
 
 def read(*names, **kwargs):
-    with io.open(ROOT_DIR.joinpath(*names), encoding=kwargs.get("encoding", "utf8")) as fp:
+    with io.open(
+        ROOT_DIR.joinpath(*names), encoding=kwargs.get("encoding", "utf8")
+    ) as fp:
         return fp.read()
 
 
 def _get_version():
     try:
-        cmd = ['git', 'rev-parse', 'HEAD']
-        sha = subprocess.check_output(cmd, cwd=str(ROOT_DIR)).decode('ascii').strip()
+        cmd = ["git", "rev-parse", "HEAD"]
+        sha = subprocess.check_output(cmd, cwd=str(ROOT_DIR)).decode("ascii").strip()
     except Exception:
         sha = None
 
-    if 'BUILD_VERSION' in os.environ:
-        version = os.environ['BUILD_VERSION']
+    if "BUILD_VERSION" in os.environ:
+        version = os.environ["BUILD_VERSION"]
     else:
-        with open(os.path.join(ROOT_DIR, 'version.txt'), 'r') as f:
+        with open(os.path.join(ROOT_DIR, "version.txt"), "r") as f:
             version = f.readline().strip()
         if sha is not None:
-            version += '+' + sha[:7]
+            version += "+" + sha[:7]
 
     if sha is None:
-        sha = 'Unknown'
+        sha = "Unknown"
     return version, sha
 
 
 def _export_version(version, sha):
-    version_path = ROOT_DIR / 'torchtext' / 'version.py'
-    with open(version_path, 'w') as fileobj:
+    version_path = ROOT_DIR / "torchtext" / "version.py"
+    with open(version_path, "w") as fileobj:
         fileobj.write("__version__ = '{}'\n".format(version))
         fileobj.write("git_version = {}\n".format(repr(sha)))
 
@@ -47,11 +49,11 @@ def _export_version(version, sha):
 VERSION, SHA = _get_version()
 _export_version(VERSION, SHA)
 
-print('-- Building version ' + VERSION)
+print("-- Building version " + VERSION)
 
-pytorch_package_version = os.getenv('PYTORCH_VERSION')
+pytorch_package_version = os.getenv("PYTORCH_VERSION")
 
-pytorch_package_dep = 'torch'
+pytorch_package_dep = "torch"
 if pytorch_package_version is not None:
     pytorch_package_dep += "==" + pytorch_package_version
 
@@ -62,53 +64,61 @@ def run(self):
         distutils.command.clean.clean.run(self)
 
         # Remove torchtext extension
-        for path in (ROOT_DIR / 'torchtext').glob('**/*.so'):
-            print(f'removing \'{path}\'')
+        for path in (ROOT_DIR / "torchtext").glob("**/*.so"):
+            print(f"removing '{path}'")
             path.unlink()
         # Remove build directory
         build_dirs = [
-            ROOT_DIR / 'build',
-            ROOT_DIR / 'third_party' / 'build',
+            ROOT_DIR / "build",
+            ROOT_DIR / "third_party" / "build",
         ]
         for path in build_dirs:
             if path.exists():
-                print(f'removing \'{path}\' (and everything under it)')
+                print(f"removing '{path}' (and everything under it)")
                 shutil.rmtree(str(path), ignore_errors=True)
 
 
 setup_info = dict(
     # Metadata
-    name='torchtext',
+    name="torchtext",
     version=VERSION,
-    author='PyTorch core devs and James Bradbury',
-    author_email='jekbradbury@gmail.com',
-    url='https://github.com/pytorch/text',
-    description='Text utilities and datasets for PyTorch',
-    long_description=read('README.rst'),
-    license='BSD',
-
+    author="PyTorch core devs and James Bradbury",
+    author_email="jekbradbury@gmail.com",
+    url="https://github.com/pytorch/text",
+    description="Text utilities and datasets for PyTorch",
+    long_description=read("README.rst"),
+    license="BSD",
     install_requires=[
-        'tqdm', 'requests', pytorch_package_dep, 'numpy'
+        "tqdm",
+        "requests",
+        pytorch_package_dep,
+        "numpy",
+        "torchdata==0.1.0a0+7772406",
+    ],
+    dependency_links=[
+        "git+https://github.com/pytorch/data.git@7772406#egg=torchdata-0.1.0a0+7772406",
     ],
-    python_requires='>=3.5',
+    python_requires=">=3.5",
     classifiers=[
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.5',
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
-        'Programming Language :: Python :: 3.8',
-        'Programming Language :: Python :: 3 :: Only',
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.5",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3 :: Only",
     ],
     # Package info
-    packages=find_packages(exclude=('test*', 'build_tools*')),
+    packages=find_packages(exclude=("test*", "build_tools*")),
     zip_safe=False,
     # Extension info
     # If you are trying to use torchtext.so and see no registered op.
     # See here: https://github.com/pytorch/vision/issues/2134"
     ext_modules=setup_helpers.get_ext_modules(),
     cmdclass={
-        'build_ext': setup_helpers.BuildExtension.with_options(no_python_abi_suffix=True),
-        'clean': clean,
+        "build_ext": setup_helpers.BuildExtension.with_options(
+            no_python_abi_suffix=True
+        ),
+        "clean": clean,
     },
 )
 
diff --git a/torchtext/experimental/datasets/sst2.py b/torchtext/experimental/datasets/sst2.py
index 1b2062a567..d662fafd42 100644
--- a/torchtext/experimental/datasets/sst2.py
+++ b/torchtext/experimental/datasets/sst2.py
@@ -82,6 +82,6 @@ def get_datapipe(self):
         )
 
         # Parse CSV file and yield data samples
-        return check_extracted_files.parse_csv(skip_header=True, delimiter="\t").map(
+        return check_extracted_files.parse_csv(skip_lines=1, delimiter="\t").map(
             lambda x: (x[0], x[1])
         )

From 0f8968eae19cc7c8570481f22f149c751849d7ab Mon Sep 17 00:00:00 2001
From: nayef211 <n63ahmed@edu.uwaterloo.ca>
Date: Wed, 13 Oct 2021 19:54:21 -0700
Subject: [PATCH 05/14] Updated unit test to check hash of first line in
 dataset

---
 setup.py                                | 87 ++++++++++++-------------
 test/experimental/test_datasets.py      | 26 ++++++--
 torchtext/experimental/datasets/sst2.py |  6 ++
 3 files changed, 68 insertions(+), 51 deletions(-)

diff --git a/setup.py b/setup.py
index 7a8b12eb51..f3b7b989ed 100644
--- a/setup.py
+++ b/setup.py
@@ -1,47 +1,45 @@
 #!/usr/bin/env python
-import distutils.command.clean
 import io
 import os
 import shutil
 import subprocess
 from pathlib import Path
+import distutils.command.clean
+from setuptools import setup, find_packages
 
 from build_tools import setup_helpers
-from setuptools import setup, find_packages
 
 ROOT_DIR = Path(__file__).parent.resolve()
 
 
 def read(*names, **kwargs):
-    with io.open(
-        ROOT_DIR.joinpath(*names), encoding=kwargs.get("encoding", "utf8")
-    ) as fp:
+    with io.open(ROOT_DIR.joinpath(*names), encoding=kwargs.get("encoding", "utf8")) as fp:
         return fp.read()
 
 
 def _get_version():
     try:
-        cmd = ["git", "rev-parse", "HEAD"]
-        sha = subprocess.check_output(cmd, cwd=str(ROOT_DIR)).decode("ascii").strip()
+        cmd = ['git', 'rev-parse', 'HEAD']
+        sha = subprocess.check_output(cmd, cwd=str(ROOT_DIR)).decode('ascii').strip()
     except Exception:
         sha = None
 
-    if "BUILD_VERSION" in os.environ:
-        version = os.environ["BUILD_VERSION"]
+    if 'BUILD_VERSION' in os.environ:
+        version = os.environ['BUILD_VERSION']
     else:
-        with open(os.path.join(ROOT_DIR, "version.txt"), "r") as f:
+        with open(os.path.join(ROOT_DIR, 'version.txt'), 'r') as f:
             version = f.readline().strip()
         if sha is not None:
-            version += "+" + sha[:7]
+            version += '+' + sha[:7]
 
     if sha is None:
-        sha = "Unknown"
+        sha = 'Unknown'
     return version, sha
 
 
 def _export_version(version, sha):
-    version_path = ROOT_DIR / "torchtext" / "version.py"
-    with open(version_path, "w") as fileobj:
+    version_path = ROOT_DIR / 'torchtext' / 'version.py'
+    with open(version_path, 'w') as fileobj:
         fileobj.write("__version__ = '{}'\n".format(version))
         fileobj.write("git_version = {}\n".format(repr(sha)))
 
@@ -49,11 +47,11 @@ def _export_version(version, sha):
 VERSION, SHA = _get_version()
 _export_version(VERSION, SHA)
 
-print("-- Building version " + VERSION)
+print('-- Building version ' + VERSION)
 
-pytorch_package_version = os.getenv("PYTORCH_VERSION")
+pytorch_package_version = os.getenv('PYTORCH_VERSION')
 
-pytorch_package_dep = "torch"
+pytorch_package_dep = 'torch'
 if pytorch_package_version is not None:
     pytorch_package_dep += "==" + pytorch_package_version
 
@@ -64,61 +62,56 @@ def run(self):
         distutils.command.clean.clean.run(self)
 
         # Remove torchtext extension
-        for path in (ROOT_DIR / "torchtext").glob("**/*.so"):
-            print(f"removing '{path}'")
+        for path in (ROOT_DIR / 'torchtext').glob('**/*.so'):
+            print(f'removing \'{path}\'')
             path.unlink()
         # Remove build directory
         build_dirs = [
-            ROOT_DIR / "build",
-            ROOT_DIR / "third_party" / "build",
+            ROOT_DIR / 'build',
+            ROOT_DIR / 'third_party' / 'build',
         ]
         for path in build_dirs:
             if path.exists():
-                print(f"removing '{path}' (and everything under it)")
+                print(f'removing \'{path}\' (and everything under it)')
                 shutil.rmtree(str(path), ignore_errors=True)
 
 
 setup_info = dict(
     # Metadata
-    name="torchtext",
+    name='torchtext',
     version=VERSION,
-    author="PyTorch core devs and James Bradbury",
-    author_email="jekbradbury@gmail.com",
-    url="https://github.com/pytorch/text",
-    description="Text utilities and datasets for PyTorch",
-    long_description=read("README.rst"),
-    license="BSD",
+    author='PyTorch core devs and James Bradbury',
+    author_email='jekbradbury@gmail.com',
+    url='https://github.com/pytorch/text',
+    description='Text utilities and datasets for PyTorch',
+    long_description=read('README.rst'),
+    license='BSD',
+
     install_requires=[
-        "tqdm",
-        "requests",
-        pytorch_package_dep,
-        "numpy",
-        "torchdata==0.1.0a0+7772406",
+        'tqdm', 'requests', pytorch_package_dep, 'numpy', 'torchdata==0.1.0a0+7772406'
     ],
     dependency_links=[
-        "git+https://github.com/pytorch/data.git@7772406#egg=torchdata-0.1.0a0+7772406",
+        "https://github.com/pytorch/data.git#egg=torchdata",
     ],
-    python_requires=">=3.5",
+    python_requires='>=3.5',
     classifiers=[
-        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.5",
-        "Programming Language :: Python :: 3.6",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
-        "Programming Language :: Python :: 3 :: Only",
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3 :: Only',
     ],
     # Package info
-    packages=find_packages(exclude=("test*", "build_tools*")),
+    packages=find_packages(exclude=('test*', 'build_tools*')),
     zip_safe=False,
     # Extension info
     # If you are trying to use torchtext.so and see no registered op.
     # See here: https://github.com/pytorch/vision/issues/2134"
     ext_modules=setup_helpers.get_ext_modules(),
     cmdclass={
-        "build_ext": setup_helpers.BuildExtension.with_options(
-            no_python_abi_suffix=True
-        ),
-        "clean": clean,
+        'build_ext': setup_helpers.BuildExtension.with_options(no_python_abi_suffix=True),
+        'clean': clean,
     },
 )
 
diff --git a/test/experimental/test_datasets.py b/test/experimental/test_datasets.py
index 5d752c9062..f443761aa2 100644
--- a/test/experimental/test_datasets.py
+++ b/test/experimental/test_datasets.py
@@ -1,3 +1,6 @@
+import hashlib
+import json
+
 from torchtext.experimental.datasets import sst2
 
 from ..common.torchtext_test_case import TorchtextTestCase
@@ -5,10 +8,25 @@
 
 class TestDataset(TorchtextTestCase):
     def test_sst2_dataset(self):
-
         split = ("train", "dev", "test")
         train_dp, dev_dp, test_dp = sst2.SST2(split=split)
 
-        self.assertEqual(len(list(train_dp)), sst2.NUM_LINES["train"])
-        self.assertEqual(len(list(dev_dp)), sst2.NUM_LINES["dev"])
-        self.assertEqual(len(list(test_dp)), sst2.NUM_LINES["test"])
+        # verify hashes of first line in dataset
+        self.assertEqual(
+            hashlib.md5(
+                json.dumps(next(iter(train_dp)), sort_keys=True).encode("utf-8")
+            ).hexdigest(),
+            sst2._FIRST_LINE_MD5["train"],
+        )
+        self.assertEqual(
+            hashlib.md5(
+                json.dumps(next(iter(dev_dp)), sort_keys=True).encode("utf-8")
+            ).hexdigest(),
+            sst2._FIRST_LINE_MD5["dev"],
+        )
+        self.assertEqual(
+            hashlib.md5(
+                json.dumps(next(iter(test_dp)), sort_keys=True).encode("utf-8")
+            ).hexdigest(),
+            sst2._FIRST_LINE_MD5["test"],
+        )
diff --git a/torchtext/experimental/datasets/sst2.py b/torchtext/experimental/datasets/sst2.py
index d662fafd42..bac16d0b1c 100644
--- a/torchtext/experimental/datasets/sst2.py
+++ b/torchtext/experimental/datasets/sst2.py
@@ -33,6 +33,12 @@
     "test": "3230e4efec76488b87877a56ae49675a",
 }
 
+_FIRST_LINE_MD5 = {
+    "train": "2552b8cecd57b2e022ef23411c688fa8",
+    "dev": "1b0ffd6aa5f2bf0fd9840a5f6f1a9f07",
+    "test": "f838c81fe40bfcd7e42e9ffc4dd004f7",
+}
+
 DATASET_NAME = "SST2"
 
 

From d2bcf2fe19ce4d4a1b0fccf6ffd9537a4f3ddc34 Mon Sep 17 00:00:00 2001
From: nayef211 <n63ahmed@edu.uwaterloo.ca>
Date: Wed, 13 Oct 2021 20:06:34 -0700
Subject: [PATCH 06/14] Fixed dependency_link url for torchdata

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index f3b7b989ed..a539a01ba2 100644
--- a/setup.py
+++ b/setup.py
@@ -91,7 +91,7 @@ def run(self):
         'tqdm', 'requests', pytorch_package_dep, 'numpy', 'torchdata==0.1.0a0+7772406'
     ],
     dependency_links=[
-        "https://github.com/pytorch/data.git#egg=torchdata",
+        "git+https://github.com/pytorch/data.git@7772406#egg=torchdata-0.1.0a0+7772406",
     ],
     python_requires='>=3.5',
     classifiers=[

From 62e6fb2dd57e7a1c6ef820b6378d1f29cd48da57 Mon Sep 17 00:00:00 2001
From: nayef211 <n63ahmed@edu.uwaterloo.ca>
Date: Thu, 14 Oct 2021 14:26:17 -0700
Subject: [PATCH 07/14] Added torchdata install to circleci config

---
 .circleci/config.yml    | 3 ++-
 .circleci/config.yml.in | 3 ++-
 setup.py                | 7 +++----
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 21bcfb9954..5de1546e36 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -497,7 +497,7 @@ jobs:
             - v1-windows-dataset-vector-{{ checksum ".cachekey" }}
             - v1-windows-dataset-{{ checksum ".cachekey" }}
 
- 
+
       - run:
           name: Run tests
           # Downloading embedding vector takes long time.
@@ -545,6 +545,7 @@ jobs:
           command: |
             set -x
             conda install -y make python=${PYTHON_VERSION}
+            pip install git+https://github.com/pytorch/data#egg=torchdata
             pip install $(ls ~/workspace/torchtext*.whl) --pre -f "https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/cpu/torch_${UPLOAD_CHANNEL}.html"
       - run:
           name: Build docs
diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in
index 971f4eb971..11f5ca7d97 100644
--- a/.circleci/config.yml.in
+++ b/.circleci/config.yml.in
@@ -497,7 +497,7 @@ jobs:
             - v1-windows-dataset-vector-{{ checksum ".cachekey" }}
             - v1-windows-dataset-{{ checksum ".cachekey" }}
           {% endraw %}
- 
+
       - run:
           name: Run tests
           # Downloading embedding vector takes long time.
@@ -545,6 +545,7 @@ jobs:
           command: |
             set -x
             conda install -y make python=${PYTHON_VERSION}
+            pip install git+https://github.com/pytorch/data#egg=torchdata
             pip install $(ls ~/workspace/torchtext*.whl) --pre -f "https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/cpu/torch_${UPLOAD_CHANNEL}.html"
       - run:
           name: Build docs
diff --git a/setup.py b/setup.py
index a539a01ba2..60584cc6ad 100644
--- a/setup.py
+++ b/setup.py
@@ -86,13 +86,12 @@ def run(self):
     description='Text utilities and datasets for PyTorch',
     long_description=read('README.rst'),
     license='BSD',
-
-    install_requires=[
-        'tqdm', 'requests', pytorch_package_dep, 'numpy', 'torchdata==0.1.0a0+7772406'
-    ],
     dependency_links=[
         "git+https://github.com/pytorch/data.git@7772406#egg=torchdata-0.1.0a0+7772406",
     ],
+    install_requires=[
+        'tqdm', 'requests', pytorch_package_dep, 'numpy', 'torchdata==0.1.0a0+7772406'
+    ],
     python_requires='>=3.5',
     classifiers=[
         'Programming Language :: Python :: 3',

From 846ee213e2808586a83f3313a5a477ff77577fa3 Mon Sep 17 00:00:00 2001
From: nayef211 <n63ahmed@edu.uwaterloo.ca>
Date: Thu, 14 Oct 2021 16:58:26 -0700
Subject: [PATCH 08/14] Updated commit id for torchdata install. Specified
 torchdata as an optional dependency

---
 .circleci/config.yml                    |  2 +-
 .circleci/config.yml.in                 |  2 +-
 torchtext/experimental/datasets/sst2.py | 18 ++++++++++++++----
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 5de1546e36..3d374bb638 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -545,7 +545,7 @@ jobs:
           command: |
             set -x
             conda install -y make python=${PYTHON_VERSION}
-            pip install git+https://github.com/pytorch/data#egg=torchdata
+            pip install git+https://github.com/pytorch/data.git@7772406#egg=torchdata
             pip install $(ls ~/workspace/torchtext*.whl) --pre -f "https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/cpu/torch_${UPLOAD_CHANNEL}.html"
       - run:
           name: Build docs
diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in
index 11f5ca7d97..b69c492f40 100644
--- a/.circleci/config.yml.in
+++ b/.circleci/config.yml.in
@@ -545,7 +545,7 @@ jobs:
           command: |
             set -x
             conda install -y make python=${PYTHON_VERSION}
-            pip install git+https://github.com/pytorch/data#egg=torchdata
+            pip install git+https://github.com/pytorch/data.git@7772406#egg=torchdata
             pip install $(ls ~/workspace/torchtext*.whl) --pre -f "https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/cpu/torch_${UPLOAD_CHANNEL}.html"
       - run:
           name: Build docs
diff --git a/torchtext/experimental/datasets/sst2.py b/torchtext/experimental/datasets/sst2.py
index bac16d0b1c..c7d8bc3e57 100644
--- a/torchtext/experimental/datasets/sst2.py
+++ b/torchtext/experimental/datasets/sst2.py
@@ -1,10 +1,20 @@
 # Copyright (c) Facebook, Inc. and its affiliates.
+import logging
 import os
 
-from torchdata.datapipes.iter import (
-    HttpReader,
-    IterableWrapper,
-)
+
+try:
+    from torchdata.datapipes.iter import (
+        HttpReader,
+        IterableWrapper,
+    )
+except ImportError:
+    logging.error(
+        "Package `torchdata` is required to be installed to use this dataset."
+        "Please use `pip install git+https://github.com/pytorch/data.git'"
+        "to install the package."
+    )
+
 from torchtext.data.datasets_utils import (
     _add_docstring_header,
     _create_dataset_directory,

From 6d21049555b71d4477f78e4f3f32b5aecc7bf9fa Mon Sep 17 00:00:00 2001
From: nayef211 <n63ahmed@edu.uwaterloo.ca>
Date: Thu, 14 Oct 2021 18:01:28 -0700
Subject: [PATCH 09/14] Removed additional hash checks during dataset
 construction

---
 torchtext/experimental/datasets/sst2.py | 23 ++++-------------------
 1 file changed, 4 insertions(+), 19 deletions(-)

diff --git a/torchtext/experimental/datasets/sst2.py b/torchtext/experimental/datasets/sst2.py
index c7d8bc3e57..7f86b8f45b 100644
--- a/torchtext/experimental/datasets/sst2.py
+++ b/torchtext/experimental/datasets/sst2.py
@@ -77,27 +77,12 @@ def get_datapipe(self):
             filepath_fn=lambda x: os.path.join(self.root, os.path.basename(x)),
         )
 
-        # do sanity check
-        check_cache_dp = cache_dp.check_hash(
-            {os.path.join(self.root, "SST-2.zip"): MD5}, "md5"
-        )
-
         # extract data from zip
-        extracted_files = check_cache_dp.read_from_zip()
-
-        # Filter extracted files and do sanity check
-        check_extracted_files = extracted_files.filter(
-            lambda x: self.split in x[0]
-        ).check_hash(
-            {
-                os.path.join(
-                    self.root, _EXTRACTED_FILES[self.split]
-                ): _EXTRACTED_FILES_MD5[self.split]
-            },
-            "md5",
-        )
+        extracted_files = cache_dp.read_from_zip()
 
         # Parse CSV file and yield data samples
-        return check_extracted_files.parse_csv(skip_lines=1, delimiter="\t").map(
+        return extracted_files.filter(
+            lambda x: self.split in x[0]
+        ).parse_csv(skip_lines=1, delimiter="\t").map(
             lambda x: (x[0], x[1])
         )

From 7a82ba0c7cc9f2e227eea1bab235376d30b75126 Mon Sep 17 00:00:00 2001
From: nayef211 <n63ahmed@edu.uwaterloo.ca>
Date: Thu, 14 Oct 2021 19:01:35 -0700
Subject: [PATCH 10/14] Removed new line from config.yml

---
 .circleci/config.yml    | 1 -
 .circleci/config.yml.in | 1 -
 2 files changed, 2 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 3d374bb638..39e2b6eaca 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -497,7 +497,6 @@ jobs:
             - v1-windows-dataset-vector-{{ checksum ".cachekey" }}
             - v1-windows-dataset-{{ checksum ".cachekey" }}
 
-
       - run:
           name: Run tests
           # Downloading embedding vector takes long time.
diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in
index b69c492f40..08aebc3f9b 100644
--- a/.circleci/config.yml.in
+++ b/.circleci/config.yml.in
@@ -497,7 +497,6 @@ jobs:
             - v1-windows-dataset-vector-{{ checksum ".cachekey" }}
             - v1-windows-dataset-{{ checksum ".cachekey" }}
           {% endraw %}
-
       - run:
           name: Run tests
           # Downloading embedding vector takes long time.

From cdb5bac3f3c9961f20dfeec053ee02cfb613e13d Mon Sep 17 00:00:00 2001
From: nayef211 <n63ahmed@edu.uwaterloo.ca>
Date: Thu, 14 Oct 2021 23:11:45 -0700
Subject: [PATCH 11/14] Removed changes from config.yml, requirements.txt, and
 setup.py. Updated unittests to be skipped if module is not available

---
 .circleci/config.yml                    |  1 -
 .circleci/config.yml.in                 |  1 -
 requirements.txt                        |  3 ---
 setup.py                                |  5 +----
 test/common/case_utils.py               |  7 +++++++
 test/experimental/test_datasets.py      |  2 ++
 torchtext/_internal/__init__.py         |  0
 torchtext/_internal/module_utils.py     | 11 +++++++++++
 torchtext/experimental/datasets/sst2.py | 24 ++++++++++++------------
 9 files changed, 33 insertions(+), 21 deletions(-)
 create mode 100644 test/common/case_utils.py
 create mode 100644 torchtext/_internal/__init__.py
 create mode 100644 torchtext/_internal/module_utils.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 39e2b6eaca..41746a7c64 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -544,7 +544,6 @@ jobs:
           command: |
             set -x
             conda install -y make python=${PYTHON_VERSION}
-            pip install git+https://github.com/pytorch/data.git@7772406#egg=torchdata
             pip install $(ls ~/workspace/torchtext*.whl) --pre -f "https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/cpu/torch_${UPLOAD_CHANNEL}.html"
       - run:
           name: Build docs
diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in
index 08aebc3f9b..d65718998f 100644
--- a/.circleci/config.yml.in
+++ b/.circleci/config.yml.in
@@ -544,7 +544,6 @@ jobs:
           command: |
             set -x
             conda install -y make python=${PYTHON_VERSION}
-            pip install git+https://github.com/pytorch/data.git@7772406#egg=torchdata
             pip install $(ls ~/workspace/torchtext*.whl) --pre -f "https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/cpu/torch_${UPLOAD_CHANNEL}.html"
       - run:
           name: Build docs
diff --git a/requirements.txt b/requirements.txt
index 3f51455663..fd100b8eb3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,9 +4,6 @@ tqdm
 # Downloading data and other files
 requests
 
-# Torchdata
-git+https://github.com/pytorch/data.git
-
 # Optional NLP tools
 nltk
 spacy
diff --git a/setup.py b/setup.py
index 60584cc6ad..e9ea6168a2 100644
--- a/setup.py
+++ b/setup.py
@@ -86,11 +86,8 @@ def run(self):
     description='Text utilities and datasets for PyTorch',
     long_description=read('README.rst'),
     license='BSD',
-    dependency_links=[
-        "git+https://github.com/pytorch/data.git@7772406#egg=torchdata-0.1.0a0+7772406",
-    ],
     install_requires=[
-        'tqdm', 'requests', pytorch_package_dep, 'numpy', 'torchdata==0.1.0a0+7772406'
+        'tqdm', 'requests', pytorch_package_dep, 'numpy'
     ],
     python_requires='>=3.5',
     classifiers=[
diff --git a/test/common/case_utils.py b/test/common/case_utils.py
new file mode 100644
index 0000000000..03eec2627f
--- /dev/null
+++ b/test/common/case_utils.py
@@ -0,0 +1,7 @@
+import unittest
+from torchtext._internal.module_utils import is_module_available
+
+
+def skipIfNoModule(module, display_name=None):
+    display_name = display_name or module
+    return unittest.skipIf(not is_module_available(module), f'"{display_name}" is not available')
diff --git a/test/experimental/test_datasets.py b/test/experimental/test_datasets.py
index f443761aa2..2a9ff700ff 100644
--- a/test/experimental/test_datasets.py
+++ b/test/experimental/test_datasets.py
@@ -3,10 +3,12 @@
 
 from torchtext.experimental.datasets import sst2
 
+from ..common.case_utils import skipIfNoModule
 from ..common.torchtext_test_case import TorchtextTestCase
 
 
 class TestDataset(TorchtextTestCase):
+    @skipIfNoModule("torchdata")
     def test_sst2_dataset(self):
         split = ("train", "dev", "test")
         train_dp, dev_dp, test_dp = sst2.SST2(split=split)
diff --git a/torchtext/_internal/__init__.py b/torchtext/_internal/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/torchtext/_internal/module_utils.py b/torchtext/_internal/module_utils.py
new file mode 100644
index 0000000000..33ac388bc4
--- /dev/null
+++ b/torchtext/_internal/module_utils.py
@@ -0,0 +1,11 @@
+import importlib.util
+
+
+def is_module_available(*modules: str) -> bool:
+    r"""Returns if a top-level module with :attr:`name` exists *without**
+    importing it. This is generally safer than try-catch block around a
+    `import X`. It avoids third party libraries breaking assumptions of some of
+    our tests, e.g., setting multiprocessing start method when imported
+    (see librosa/#747, torchvision/#544).
+    """
+    return all(importlib.util.find_spec(m) is not None for m in modules)
diff --git a/torchtext/experimental/datasets/sst2.py b/torchtext/experimental/datasets/sst2.py
index 7f86b8f45b..1b391ed2ec 100644
--- a/torchtext/experimental/datasets/sst2.py
+++ b/torchtext/experimental/datasets/sst2.py
@@ -2,25 +2,25 @@
 import logging
 import os
 
+from torchtext._internal.module_utils import is_module_available
+from torchtext.data.datasets_utils import (
+    _add_docstring_header,
+    _create_dataset_directory,
+    _wrap_split_argument,
+)
 
-try:
+if is_module_available("torchdata"):
     from torchdata.datapipes.iter import (
         HttpReader,
         IterableWrapper,
     )
-except ImportError:
+else:
     logging.error(
         "Package `torchdata` is required to be installed to use this dataset."
         "Please use `pip install git+https://github.com/pytorch/data.git'"
         "to install the package."
     )
 
-from torchtext.data.datasets_utils import (
-    _add_docstring_header,
-    _create_dataset_directory,
-    _wrap_split_argument,
-)
-
 
 NUM_LINES = {
     "train": 67349,
@@ -81,8 +81,8 @@ def get_datapipe(self):
         extracted_files = cache_dp.read_from_zip()
 
         # Parse CSV file and yield data samples
-        return extracted_files.filter(
-            lambda x: self.split in x[0]
-        ).parse_csv(skip_lines=1, delimiter="\t").map(
-            lambda x: (x[0], x[1])
+        return (
+            extracted_files.filter(lambda x: self.split in x[0])
+            .parse_csv(skip_lines=1, delimiter="\t")
+            .map(lambda x: (x[0], x[1]))
         )

From fc41492634246305988ec640aac2ae26555d96fc Mon Sep 17 00:00:00 2001
From: nayef211 <n63ahmed@edu.uwaterloo.ca>
Date: Fri, 15 Oct 2021 12:59:04 -0700
Subject: [PATCH 12/14] Incroporated review feedback

---
 torchtext/experimental/datasets/sst2.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/torchtext/experimental/datasets/sst2.py b/torchtext/experimental/datasets/sst2.py
index 1b391ed2ec..85b892eb69 100644
--- a/torchtext/experimental/datasets/sst2.py
+++ b/torchtext/experimental/datasets/sst2.py
@@ -9,16 +9,18 @@
     _wrap_split_argument,
 )
 
+logger = logging.getLogger(__name__)
+
 if is_module_available("torchdata"):
     from torchdata.datapipes.iter import (
         HttpReader,
         IterableWrapper,
     )
 else:
-    logging.error(
+    logger.warning(
         "Package `torchdata` is required to be installed to use this dataset."
-        "Please use `pip install git+https://github.com/pytorch/data.git'"
-        "to install the package."
+        "Please refer to https://github.com/pytorch/data for instructions on "
+        "how to install the package."
     )
 
 

From 535c0509678a41ab49693c21637f38d9b86ba97b Mon Sep 17 00:00:00 2001
From: nayef211 <n63ahmed@edu.uwaterloo.ca>
Date: Fri, 15 Oct 2021 14:20:42 -0700
Subject: [PATCH 13/14] Added torchdata installation for unittests

---
 .circleci/unittest/linux/scripts/install.sh   | 3 +++
 .circleci/unittest/windows/scripts/install.sh | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/.circleci/unittest/linux/scripts/install.sh b/.circleci/unittest/linux/scripts/install.sh
index e9201b266b..a3ecba2770 100755
--- a/.circleci/unittest/linux/scripts/install.sh
+++ b/.circleci/unittest/linux/scripts/install.sh
@@ -13,6 +13,9 @@ conda activate ./env
 printf "* Installing PyTorch\n"
 conda install -y -c "pytorch-${UPLOAD_CHANNEL}" ${CONDA_CHANNEL_FLAGS} pytorch cpuonly
 
+printf "Installing torchdata from source\n"
+pip install git+https://github.com/pytorch/data.git
+
 printf "* Installing torchtext\n"
 git submodule update --init --recursive
 python setup.py develop
diff --git a/.circleci/unittest/windows/scripts/install.sh b/.circleci/unittest/windows/scripts/install.sh
index 622ebc1cd1..1922b9a78f 100644
--- a/.circleci/unittest/windows/scripts/install.sh
+++ b/.circleci/unittest/windows/scripts/install.sh
@@ -18,6 +18,9 @@ conda activate ./env
 printf "* Installing PyTorch\n"
 conda install -y -c "pytorch-${UPLOAD_CHANNEL}" ${CONDA_CHANNEL_FLAGS} pytorch cpuonly
 
+printf "Installing torchdata from source\n"
+pip install git+https://github.com/pytorch/data.git
+
 printf "* Installing torchtext\n"
 git submodule update --init --recursive
 "$root_dir/packaging/vc_env_helper.bat" python setup.py develop

From 80b83e5d6a889de7ff04bed65ae93dc448068f4d Mon Sep 17 00:00:00 2001
From: nayef211 <n63ahmed@edu.uwaterloo.ca>
Date: Fri, 15 Oct 2021 15:31:15 -0700
Subject: [PATCH 14/14] Removed newline changes

---
 .circleci/config.yml    | 1 +
 .circleci/config.yml.in | 1 +
 setup.py                | 1 +
 3 files changed, 3 insertions(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 41746a7c64..bc3f09d043 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -497,6 +497,7 @@ jobs:
             - v1-windows-dataset-vector-{{ checksum ".cachekey" }}
             - v1-windows-dataset-{{ checksum ".cachekey" }}
 
+
       - run:
           name: Run tests
           # Downloading embedding vector takes long time.
diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in
index d65718998f..911295217b 100644
--- a/.circleci/config.yml.in
+++ b/.circleci/config.yml.in
@@ -497,6 +497,7 @@ jobs:
             - v1-windows-dataset-vector-{{ checksum ".cachekey" }}
             - v1-windows-dataset-{{ checksum ".cachekey" }}
           {% endraw %}
+
       - run:
           name: Run tests
           # Downloading embedding vector takes long time.
diff --git a/setup.py b/setup.py
index e9ea6168a2..5db3388053 100644
--- a/setup.py
+++ b/setup.py
@@ -86,6 +86,7 @@ def run(self):
     description='Text utilities and datasets for PyTorch',
     long_description=read('README.rst'),
     license='BSD',
+
     install_requires=[
         'tqdm', 'requests', pytorch_package_dep, 'numpy'
     ],