Skip to content

Commit

Permalink
generate unicode strings to test utf-8 handling for all non-IWSLT dataset tests. (#1599)
Browse files Browse the repository at this point in the history
  • Loading branch information
erip authored Feb 12, 2022
1 parent d40c375 commit 2e93d94
Show file tree
Hide file tree
Showing 23 changed files with 104 additions and 142 deletions.
25 changes: 25 additions & 0 deletions test/common/case_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import random
import os.path
import tempfile
import unittest
Expand Down Expand Up @@ -53,3 +54,27 @@ def zip_equal(*iterables):
if sentinel in combo:
raise ValueError("Iterables have different lengths")
yield combo


# Code point ranges to sample from.  Mixes ASCII with several non-ASCII
# blocks so that generated strings exercise multi-byte UTF-8 encodings.
# (Adapted from https://stackoverflow.com/a/21666621/2883245)
_UNICODE_SAMPLE_RANGES = [
    (0x0021, 0x0021),
    (0x0023, 0x0026),
    (0x0028, 0x007E),
    (0x00A1, 0x00AC),
    (0x00AE, 0x00FF),
    (0x0100, 0x017F),
    (0x0180, 0x024F),
    (0x2C60, 0x2C7F),
    (0x16A0, 0x16F0),
    (0x0370, 0x0377),
    (0x037A, 0x037E),
    (0x0384, 0x038A),
    (0x038C, 0x038C),
]

# Flatten the ranges into a character alphabet ONCE at import time instead
# of rebuilding the ~250-entry list on every call (the list is invariant
# across calls).
_UNICODE_ALPHABET = [
    chr(code_point)
    for low, high in _UNICODE_SAMPLE_RANGES
    for code_point in range(low, high + 1)
]


def get_random_unicode(length):
    """Return a random string of ``length`` characters drawn from a fixed
    set of Unicode code-point ranges (ASCII plus several non-ASCII blocks).

    Uses the module-level ``random`` state, so output is reproducible
    after ``random.seed(...)``.

    Args:
        length (int): number of characters to generate; ``0`` yields ``""``.

    Returns:
        str: the generated string.
    """
    # random.choices samples with replacement in one C-level call,
    # replacing the per-character random.choice() generator loop.
    return ''.join(random.choices(_UNICODE_ALPHABET, k=length))
10 changes: 3 additions & 7 deletions test/datasets/test_agnews.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import os
import random
import string
from collections import defaultdict
from unittest.mock import patch

from parameterized import parameterized
from torchtext.datasets.ag_news import AG_NEWS

from ..common.case_utils import TempDirMixin, zip_equal
from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
from ..common.torchtext_test_case import TorchtextTestCase


Expand All @@ -22,12 +20,10 @@ def _get_mock_dataset(root_dir):
mocked_data = defaultdict(list)
for file_name in ("train.csv", "test.csv"):
txt_file = os.path.join(base_dir, file_name)
with open(txt_file, "w") as f:
with open(txt_file, "w", encoding="utf-8") as f:
for i in range(5):
label = seed % 4 + 1
rand_string = " ".join(
random.choice(string.ascii_letters) for i in range(seed)
)
rand_string = get_random_unicode(seed)
dataset_line = (label, f"{rand_string} {rand_string}")
# append line to correct dataset split
mocked_data[os.path.splitext(file_name)[0]].append(dataset_line)
Expand Down
11 changes: 4 additions & 7 deletions test/datasets/test_amazonreviews.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
import os
import random
import string
import tarfile
from collections import defaultdict
from unittest.mock import patch

from torchtext.datasets.amazonreviewfull import AmazonReviewFull
from torchtext.datasets.amazonreviewpolarity import AmazonReviewPolarity

from ..common.case_utils import TempDirMixin, zip_equal
from ..common.parameterized_utils import nested_params
from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
from ..common.torchtext_test_case import TorchtextTestCase


Expand All @@ -26,15 +24,14 @@ def _get_mock_dataset(root_dir, base_dir_name):
mocked_data = defaultdict(list)
for file_name in ("train.csv", "test.csv"):
txt_file = os.path.join(temp_dataset_dir, file_name)
with open(txt_file, "w") as f:
with open(txt_file, "w", encoding="utf-8") as f:
for i in range(5):
if base_dir_name == AmazonReviewFull.__name__:
label = seed % 5 + 1
else:
label = seed % 2 + 1
rand_string = " ".join(
random.choice(string.ascii_letters) for i in range(seed)
)
label = seed % 2 + 1
rand_string = get_random_unicode(seed)
dataset_line = (label, f"{rand_string} {rand_string}")
# append line to correct dataset split
mocked_data[os.path.splitext(file_name)[0]].append(dataset_line)
Expand Down
12 changes: 4 additions & 8 deletions test/datasets/test_cc100.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
import lzma
import os
import random
import string
import lzma
from collections import defaultdict
from unittest.mock import patch

from parameterized import parameterized
from torchtext.datasets import CC100
from torchtext.datasets.cc100 import VALID_CODES

from ..common.case_utils import TempDirMixin, zip_equal
from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
from ..common.torchtext_test_case import TorchtextTestCase


Expand All @@ -26,11 +24,9 @@ def _get_mock_dataset(root_dir):
for language_code in VALID_CODES:
file_name = f"{language_code}.txt.xz"
compressed_file = os.path.join(base_dir, file_name)
with lzma.open(compressed_file, "wt") as f:
with lzma.open(compressed_file, "wt", encoding="utf-8") as f:
for i in range(5):
rand_string = " ".join(
random.choice(string.ascii_letters) for i in range(seed)
)
rand_string = get_random_unicode(seed)
content = f"{rand_string}\n"
f.write(content)
mocked_data[language_code].append((language_code, rand_string))
Expand Down
20 changes: 6 additions & 14 deletions test/datasets/test_conll2000chunking.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import gzip
import os
import random
import string
import gzip
from collections import defaultdict
from unittest.mock import patch

from parameterized import parameterized
from torchtext.datasets.conll2000chunking import CoNLL2000Chunking

from ..common.case_utils import TempDirMixin, zip_equal
from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
from ..common.torchtext_test_case import TorchtextTestCase


Expand All @@ -25,17 +23,11 @@ def _get_mock_dataset(root_dir):
for file_name in ("train.txt", "test.txt"):
txt_file = os.path.join(temp_dataset_dir, file_name)
mocked_lines = mocked_data[os.path.splitext(file_name)[0]]
with open(txt_file, "w") as f:
with open(txt_file, "w", encoding="utf-8") as f:
for i in range(5):
rand_strings = [
random.choice(string.ascii_letters) for i in range(seed)
]
rand_label_1 = [
random.choice(string.ascii_letters) for i in range(seed)
]
rand_label_2 = [
random.choice(string.ascii_letters) for i in range(seed)
]
rand_strings = [get_random_unicode(seed)]
rand_label_1 = [get_random_unicode(seed)]
rand_label_2 = [get_random_unicode(seed)]
# one token per line (each sample ends with an extra \n)
for rand_string, label_1, label_2 in zip(
rand_strings, rand_label_1, rand_label_2
Expand Down
10 changes: 3 additions & 7 deletions test/datasets/test_dbpedia.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import os
import random
import string
import tarfile
from collections import defaultdict
from unittest.mock import patch

from parameterized import parameterized
from torchtext.datasets.dbpedia import DBpedia

from ..common.case_utils import TempDirMixin, zip_equal
from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
from ..common.torchtext_test_case import TorchtextTestCase


Expand All @@ -25,12 +23,10 @@ def _get_mock_dataset(root_dir):
for file_name in ("train.csv", "test.csv"):
csv_file = os.path.join(temp_dataset_dir, file_name)
mocked_lines = mocked_data[os.path.splitext(file_name)[0]]
with open(csv_file, "w") as f:
with open(csv_file, "w", encoding="utf-8") as f:
for i in range(5):
label = seed % 14 + 1
rand_string = " ".join(
random.choice(string.ascii_letters) for i in range(seed)
)
rand_string = get_random_unicode(seed)
dataset_line = (label, rand_string + " " + rand_string)
f.write(f'{label},"{rand_string}","{rand_string}"\n')

Expand Down
12 changes: 3 additions & 9 deletions test/datasets/test_enwik9.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import os
import random
import string
import zipfile
from unittest.mock import patch

from torchtext.datasets.enwik9 import EnWik9

from ..common.case_utils import TempDirMixin, zip_equal
from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
from ..common.torchtext_test_case import TorchtextTestCase


Expand All @@ -22,13 +20,9 @@ def _get_mock_dataset(root_dir):
file_name = "enwik9"
txt_file = os.path.join(temp_dataset_dir, file_name)
mocked_data = []
with open(txt_file, "w") as f:
with open(txt_file, "w", encoding="utf-8") as f:
for i in range(5):
rand_string = (
"<"
+ " ".join(random.choice(string.ascii_letters) for i in range(seed))
+ ">"
)
rand_string = "<" + get_random_unicode(seed) + ">"
dataset_line = f"'{rand_string}'"
f.write(f"'{rand_string}'\n")

Expand Down
10 changes: 3 additions & 7 deletions test/datasets/test_imdb.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import os
import random
import string
import tarfile
from collections import defaultdict
from unittest.mock import patch

from parameterized import parameterized
from torchtext.datasets.imdb import IMDB

from ..common.case_utils import TempDirMixin, zip_equal
from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
from ..common.torchtext_test_case import TorchtextTestCase


Expand All @@ -34,10 +32,8 @@ def _get_mock_dataset(root_dir):
label = "neg" if i < 2 else "pos"
cur_dir = pos_dir if label == "pos" else neg_dir
txt_file = os.path.join(cur_dir, f"{i}{i}_{i}.txt")
with open(txt_file, "w") as f:
rand_string = " ".join(
random.choice(string.ascii_letters) for i in range(seed)
)
with open(txt_file, "w", encoding="utf-8") as f:
rand_string = get_random_unicode(seed)
dataset_line = (label, rand_string)
# append line to correct dataset split
mocked_data[split].append(dataset_line)
Expand Down
10 changes: 3 additions & 7 deletions test/datasets/test_multi30k.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import os
import random
import string
import tarfile
from collections import defaultdict
from unittest.mock import patch

from torchtext.datasets import Multi30k

from ..common.case_utils import TempDirMixin, zip_equal
from ..common.parameterized_utils import nested_params
from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
from ..common.torchtext_test_case import TorchtextTestCase


Expand All @@ -24,11 +22,9 @@ def _get_mock_dataset(root_dir):
mocked_data = defaultdict(list)
for file_name in ("train.de", "train.en", "val.de", "val.en", "test.de", "test.en"):
txt_file = os.path.join(temp_dataset_dir, file_name)
with open(txt_file, "w") as f:
with open(txt_file, "w", encoding="utf-8") as f:
for i in range(5):
rand_string = " ".join(
random.choice(string.ascii_letters) for i in range(seed)
)
rand_string = get_random_unicode(seed)
f.write(rand_string + "\n")
mocked_data[file_name].append(rand_string)
seed += 1
Expand Down
10 changes: 3 additions & 7 deletions test/datasets/test_penntreebank.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import os
import random
import string
from collections import defaultdict
from unittest.mock import patch

from parameterized import parameterized
from torchtext.datasets.penntreebank import PennTreebank

from ..common.case_utils import TempDirMixin, zip_equal
from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
from ..common.torchtext_test_case import TorchtextTestCase


Expand All @@ -22,11 +20,9 @@ def _get_mock_dataset(root_dir):
mocked_data = defaultdict(list)
for file_name in ("ptb.train.txt", "ptb.valid.txt", "ptb.test.txt"):
txt_file = os.path.join(base_dir, file_name)
with open(txt_file, "w") as f:
with open(txt_file, "w", encoding="utf-8") as f:
for i in range(5):
rand_string = " ".join(
random.choice(string.ascii_letters) for i in range(seed)
)
rand_string = get_random_unicode(seed)
dataset_line = f"{rand_string}"
# append line to correct dataset split
split = file_name.replace("ptb.", "").replace(".txt", "")
Expand Down
10 changes: 3 additions & 7 deletions test/datasets/test_sogounews.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import os
import random
import string
import tarfile
from collections import defaultdict
from unittest.mock import patch

from parameterized import parameterized
from torchtext.datasets.sogounews import SogouNews

from ..common.case_utils import TempDirMixin, zip_equal
from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
from ..common.torchtext_test_case import TorchtextTestCase


Expand All @@ -24,12 +22,10 @@ def _get_mock_dataset(root_dir):
mocked_data = defaultdict(list)
for file_name in ("train.csv", "test.csv"):
txt_file = os.path.join(temp_dataset_dir, file_name)
with open(txt_file, "w") as f:
with open(txt_file, "w", encoding="utf-8") as f:
for i in range(5):
label = seed % 5 + 1
rand_string = " ".join(
random.choice(string.ascii_letters) for i in range(seed)
)
rand_string = get_random_unicode(seed)
dataset_line = (label, f"{rand_string} {rand_string}")
# append line to correct dataset split
mocked_data[os.path.splitext(file_name)[0]].append(dataset_line)
Expand Down
8 changes: 3 additions & 5 deletions test/datasets/test_squads.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import json
import os
import random
import string
import uuid
from collections import defaultdict
from random import randint
Expand All @@ -11,13 +9,13 @@
from torchtext.datasets.squad1 import SQuAD1
from torchtext.datasets.squad2 import SQuAD2

from ..common.case_utils import TempDirMixin, zip_equal
from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
from ..common.parameterized_utils import nested_params
from ..common.torchtext_test_case import TorchtextTestCase


def _get_mock_json_data():
rand_string = " ".join(random.choice(string.ascii_letters) for i in range(10))
rand_string = get_random_unicode(10)
mock_json_data = {
"data": [
{
Expand Down Expand Up @@ -60,7 +58,7 @@ def _get_mock_dataset(root_dir, base_dir_name):
mocked_data = defaultdict(list)
for file_name in file_names:
txt_file = os.path.join(base_dir, file_name)
with open(txt_file, "w") as f:
with open(txt_file, "w", encoding="utf-8") as f:
mock_json_data = _get_mock_json_data()
f.write(json.dumps(mock_json_data))

Expand Down
Loading

0 comments on commit 2e93d94

Please sign in to comment.