From 4800b2cf517bf1c51affb621b872331101062dd1 Mon Sep 17 00:00:00 2001 From: nayef211 Date: Thu, 27 Jan 2022 11:48:05 -0800 Subject: [PATCH 1/6] Added mock test for SST2 --- test/datasets/test_sst2.py | 101 +++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 test/datasets/test_sst2.py diff --git a/test/datasets/test_sst2.py b/test/datasets/test_sst2.py new file mode 100644 index 0000000000..dfe962360a --- /dev/null +++ b/test/datasets/test_sst2.py @@ -0,0 +1,101 @@ +import os +import random +import string +import zipfile +from collections import defaultdict +from unittest.mock import patch + +from parameterized import parameterized +from torchtext.datasets.sst2 import SST2 + +from ..common.case_utils import TempDirMixin +from ..common.torchtext_test_case import TorchtextTestCase + + +def _get_mock_dataset(root_dir): + """ + root_dir: directory to the mocked dataset + """ + base_dir = os.path.join(root_dir, "SST2") + temp_dataset_dir = os.path.join(base_dir, "temp_dataset_dir") + os.makedirs(temp_dataset_dir, exist_ok=True) + + print(base_dir) + seed = 1 + mocked_data = defaultdict(list) + for file_name, (col1_name, col2_name) in zip( + ("train.tsv", "test.tsv", "dev.tsv"), + ((("sentence", "label"), ("sentence", "label"), ("index", "sentence"))), + ): + txt_file = os.path.join(temp_dataset_dir, file_name) + with open(txt_file, "w") as f: + f.write(f"{col1_name}\t{col2_name}\n") + for i in range(5): + label = seed % 2 + rand_string = " ".join( + random.choice(string.ascii_letters) for i in range(seed) + ) + if file_name == "test.tsv": + dataset_line = (f"{rand_string} .",) + f.write(f"{i}\t{rand_string} .\n") + else: + dataset_line = (f"{rand_string} .", label) + f.write(f"{rand_string} .\t{label}\n") + + # append line to correct dataset split + mocked_data[os.path.splitext(file_name)[0]].append(dataset_line) + seed += 1 + + compressed_dataset_path = os.path.join(base_dir, "SST-2.zip") + # create tar file from dataset folder + 
with zipfile.ZipFile(compressed_dataset_path, "w") as zip_file: + for file_name in ("train.tsv", "test.tsv", "dev.tsv"): + txt_file = os.path.join(temp_dataset_dir, file_name) + zip_file.write(txt_file, arcname=os.path.join("SST-2", file_name)) + + return mocked_data + + +class TestSST2(TempDirMixin, TorchtextTestCase): + root_dir = None + samples = [] + + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.root_dir = cls.get_base_temp_dir() + cls.samples = _get_mock_dataset(cls.root_dir) + + @parameterized.expand(["train", "test", "dev"]) + def test_sst2(self, split): + with patch( + "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True + ): + dataset = SST2(root=self.root_dir, split=split) + n_iter = 0 + + if split == "test": + for i, (text,) in enumerate(dataset): + expected_sample = self.samples[split][i] + assert text == expected_sample[0] + n_iter += 1 + else: + for i, (text, label) in enumerate(dataset): + expected_sample = self.samples[split][i] + assert text == expected_sample[0] + assert label == expected_sample[1] + n_iter += 1 + assert n_iter == len(self.samples[split]) + + @parameterized.expand( + [("train", ("train",)), ("dev", ("dev",)), ("test", ("test",))] + ) + def test_sst2_split_argument(self, split1, split2): + with patch( + "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True + ): + dataset1 = SST2(root=self.root_dir, split=split1) + (dataset2,) = SST2(root=self.root_dir, split=split2) + + for d1, d2 in zip(dataset1, dataset2): + self.assertEqual(d1, d2) From cb6d616945c5139b2970e9f004e3aeca1b61b3f7 Mon Sep 17 00:00:00 2001 From: nayef211 Date: Thu, 27 Jan 2022 13:16:09 -0800 Subject: [PATCH 2/6] Remove print line --- test/datasets/test_sst2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/datasets/test_sst2.py b/test/datasets/test_sst2.py index dfe962360a..63e1d571c9 100644 --- a/test/datasets/test_sst2.py +++ b/test/datasets/test_sst2.py @@ -20,7 +20,6 @@ def 
_get_mock_dataset(root_dir): temp_dataset_dir = os.path.join(base_dir, "temp_dataset_dir") os.makedirs(temp_dataset_dir, exist_ok=True) - print(base_dir) seed = 1 mocked_data = defaultdict(list) for file_name, (col1_name, col2_name) in zip( From 791a8d2649742482c715dae2bac40bfc0998c345 Mon Sep 17 00:00:00 2001 From: nayef211 Date: Thu, 27 Jan 2022 14:07:29 -0800 Subject: [PATCH 3/6] Resolving PR comments --- test/common/case_utils.py | 17 +++++++++++- test/datasets/test_sst2.py | 56 +++++++++++++++++--------------------- 2 files changed, 41 insertions(+), 32 deletions(-) diff --git a/test/common/case_utils.py b/test/common/case_utils.py index f8803894b0..9a9340fbff 100644 --- a/test/common/case_utils.py +++ b/test/common/case_utils.py @@ -1,6 +1,7 @@ import os.path import tempfile import unittest +from itertools import zip_longest from torchtext._internal.module_utils import is_module_available @@ -37,4 +38,18 @@ def get_temp_path(self, *paths): def skipIfNoModule(module, display_name=None): display_name = display_name or module - return unittest.skipIf(not is_module_available(module), f'"{display_name}" is not available') + return unittest.skipIf( + not is_module_available(module), f'"{display_name}" is not available' + ) + + +def zip_equal(*iterables): + """With the regular Python `zip` function, if one iterable is longer than the other, + the remainder portions are ignored. This is resolved in Python 3.10 where we can use + `strict=True` in the `zip` function. + """ + sentinel = object() + for combo in zip_longest(*iterables, fillvalue=sentinel): + if sentinel in combo: + raise ValueError("Iterables have different lengths") + yield combo diff --git a/test/datasets/test_sst2.py b/test/datasets/test_sst2.py index 63e1d571c9..d20050504a 100644 --- a/test/datasets/test_sst2.py +++ b/test/datasets/test_sst2.py @@ -8,7 +8,7 @@ from parameterized import parameterized from torchtext.datasets.sst2 import SST2 -from ..common.case_utils import TempDirMixin +from 
..common.case_utils import TempDirMixin, zip_equal from ..common.torchtext_test_case import TorchtextTestCase @@ -64,37 +64,31 @@ def setUpClass(cls): super().setUpClass() cls.root_dir = cls.get_base_temp_dir() cls.samples = _get_mock_dataset(cls.root_dir) + cls.patcher = patch( + "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True + ) + cls.patcher.start() + + @classmethod + def tearDownClass(cls): + super().tearDownClass() + cls.patcher.stop() @parameterized.expand(["train", "test", "dev"]) def test_sst2(self, split): - with patch( - "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True - ): - dataset = SST2(root=self.root_dir, split=split) - n_iter = 0 - - if split == "test": - for i, (text,) in enumerate(dataset): - expected_sample = self.samples[split][i] - assert text == expected_sample[0] - n_iter += 1 - else: - for i, (text, label) in enumerate(dataset): - expected_sample = self.samples[split][i] - assert text == expected_sample[0] - assert label == expected_sample[1] - n_iter += 1 - assert n_iter == len(self.samples[split]) - - @parameterized.expand( - [("train", ("train",)), ("dev", ("dev",)), ("test", ("test",))] - ) - def test_sst2_split_argument(self, split1, split2): - with patch( - "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True - ): - dataset1 = SST2(root=self.root_dir, split=split1) - (dataset2,) = SST2(root=self.root_dir, split=split2) + dataset = SST2(root=self.root_dir, split=split) + n_iter = 0 + + for i, sample in enumerate(dataset): + expected_sample = self.samples[split][i] + assert sample == expected_sample + n_iter += 1 + assert n_iter == len(self.samples[split]) + + @parameterized.expand(["train", "dev", "test"]) + def test_sst2_split_argument(self, split): + dataset1 = SST2(root=self.root_dir, split=split) + (dataset2,) = SST2(root=self.root_dir, split=(split,)) - for d1, d2 in zip(dataset1, dataset2): - self.assertEqual(d1, d2) + for d1, d2 in zip_equal(dataset1, 
dataset2): + self.assertEqual(d1, d2) From 60ac1aa507c85ba926edf9370ab2fb4c80b70e5a Mon Sep 17 00:00:00 2001 From: nayef211 Date: Thu, 27 Jan 2022 14:10:26 -0800 Subject: [PATCH 4/6] Updated comment to say zip --- test/datasets/test_sst2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/datasets/test_sst2.py b/test/datasets/test_sst2.py index d20050504a..a634ffe24c 100644 --- a/test/datasets/test_sst2.py +++ b/test/datasets/test_sst2.py @@ -46,7 +46,7 @@ def _get_mock_dataset(root_dir): seed += 1 compressed_dataset_path = os.path.join(base_dir, "SST-2.zip") - # create tar file from dataset folder + # create zip file from dataset folder with zipfile.ZipFile(compressed_dataset_path, "w") as zip_file: for file_name in ("train.tsv", "test.tsv", "dev.tsv"): txt_file = os.path.join(temp_dataset_dir, file_name) From 9bdfbcc605f7207ffd8704448c488d2f07a84512 Mon Sep 17 00:00:00 2001 From: nayef211 Date: Thu, 27 Jan 2022 14:11:15 -0800 Subject: [PATCH 5/6] updated ordering of splits in parameterization --- test/datasets/test_sst2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/datasets/test_sst2.py b/test/datasets/test_sst2.py index a634ffe24c..5b09081ea8 100644 --- a/test/datasets/test_sst2.py +++ b/test/datasets/test_sst2.py @@ -85,7 +85,7 @@ def test_sst2(self, split): n_iter += 1 assert n_iter == len(self.samples[split]) - @parameterized.expand(["train", "dev", "test"]) + @parameterized.expand(["train", "test", "dev"]) def test_sst2_split_argument(self, split): dataset1 = SST2(root=self.root_dir, split=split) (dataset2,) = SST2(root=self.root_dir, split=(split,)) From f9550d7c4d2d6f79eef934ef986807fa2dc753b4 Mon Sep 17 00:00:00 2001 From: nayef211 Date: Thu, 27 Jan 2022 22:07:17 -0800 Subject: [PATCH 6/6] Using zip_equal for iteration in test_sst2 --- test/datasets/test_sst2.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/test/datasets/test_sst2.py b/test/datasets/test_sst2.py index 
5b09081ea8..29fdb6fbed 100644 --- a/test/datasets/test_sst2.py +++ b/test/datasets/test_sst2.py @@ -71,19 +71,17 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - super().tearDownClass() cls.patcher.stop() + super().tearDownClass() @parameterized.expand(["train", "test", "dev"]) def test_sst2(self, split): dataset = SST2(root=self.root_dir, split=split) - n_iter = 0 - for i, sample in enumerate(dataset): - expected_sample = self.samples[split][i] - assert sample == expected_sample - n_iter += 1 - assert n_iter == len(self.samples[split]) + samples = list(dataset) + expected_samples = self.samples[split] + for sample, expected_sample in zip_equal(samples, expected_samples): + self.assertEqual(sample, expected_sample) @parameterized.expand(["train", "test", "dev"]) def test_sst2_split_argument(self, split):