From f898310d0cfc1b4cbb38b36f8bfc160108ac9c33 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Wed, 8 Dec 2021 09:41:50 -0800 Subject: [PATCH] Import torchtext #1437 2cebac3 Summary: Imports [#1437](https://github.com/pytorch/text/pull/1437) from OSS Torchtext that removes the legacy folder. Reviewed By: parmeet Differential Revision: D32923084 fbshipit-source-id: 83411efd62cd527c518e36279bdbf586435ac9e5 --- benchmark/benchmark_vocab.py | 134 +-- .../legacy_tutorial/migration_tutorial.ipynb | 520 ---------- examples/vocab/vocab.py | 37 - test/experimental/test_datasets.py | 2 +- test/legacy/__init__.py | 0 test/legacy/babi.py | 50 - test/legacy/data.py | 27 - test/legacy/data/__init__.py | 0 test/legacy/data/test_batch.py | 43 - test/legacy/data/test_dataset.py | 511 ---------- test/legacy/data/test_field.py | 901 ------------------ test/legacy/data/test_pipeline.py | 52 - test/legacy/data/test_subword.py | 25 - test/legacy/imdb.py | 43 - test/legacy/language_modeling.py | 38 - test/legacy/nli.py | 304 ------ test/legacy/sequence_tagging.py | 86 -- test/legacy/sst.py | 69 -- test/legacy/test_vocab.py | 131 --- test/legacy/translation.py | 102 -- test/legacy/trec.py | 46 - test/test_build.py | 261 +---- test/test_transforms.py | 3 - torchtext/__init__.py | 4 +- torchtext/experimental/datasets/sst2.py | 15 +- torchtext/legacy/README.rst | 53 -- torchtext/legacy/__init__.py | 11 - torchtext/legacy/data/__init__.py | 34 - torchtext/legacy/data/batch.py | 101 -- torchtext/legacy/data/dataset.py | 362 ------- torchtext/legacy/data/example.py | 99 -- torchtext/legacy/data/field.py | 735 -------------- torchtext/legacy/data/iterator.py | 297 ------ torchtext/legacy/data/pipeline.py | 85 -- torchtext/legacy/datasets/__init__.py | 42 - torchtext/legacy/datasets/babi.py | 140 --- torchtext/legacy/datasets/imdb.py | 80 -- .../legacy/datasets/language_modeling.py | 217 ----- torchtext/legacy/datasets/nli.py | 191 ---- torchtext/legacy/datasets/sequence_tagging.py | 102 -- torchtext/legacy/datasets/sst.py | 104 -- .../legacy/datasets/text_classification.py | 452 --------- torchtext/legacy/datasets/translation.py | 234 ----- torchtext/legacy/datasets/trec.py | 85 -- .../legacy/datasets/unsupervised_learning.py | 136 --- torchtext/legacy/vocab.py | 294 ------ torchtext/transforms.py | 2 +- 47 files changed, 60 insertions(+), 7200 deletions(-) delete mode 100644 examples/legacy_tutorial/migration_tutorial.ipynb delete mode 100755 examples/vocab/vocab.py delete mode 100644 test/legacy/__init__.py delete mode 100644 test/legacy/babi.py delete mode 100644 test/legacy/data.py delete mode 100644 test/legacy/data/__init__.py delete mode 100644 test/legacy/data/test_batch.py delete mode 100644 test/legacy/data/test_dataset.py delete mode 100644 test/legacy/data/test_field.py delete mode 100644 test/legacy/data/test_pipeline.py delete mode 100644 test/legacy/data/test_subword.py delete mode 100644 test/legacy/imdb.py delete mode 100644 test/legacy/language_modeling.py delete mode 100644 test/legacy/nli.py delete mode 100644 test/legacy/sequence_tagging.py delete mode 100644 test/legacy/sst.py delete mode 100644 test/legacy/test_vocab.py delete mode 100644 test/legacy/translation.py delete mode 100644 test/legacy/trec.py delete mode 100644 torchtext/legacy/README.rst delete mode 100644 torchtext/legacy/__init__.py delete mode 100644 torchtext/legacy/data/__init__.py delete mode 100644 torchtext/legacy/data/batch.py delete mode 100644 torchtext/legacy/data/dataset.py delete mode 100644 
torchtext/legacy/data/example.py delete mode 100644 torchtext/legacy/data/field.py delete mode 100644 torchtext/legacy/data/iterator.py delete mode 100644 torchtext/legacy/data/pipeline.py delete mode 100644 torchtext/legacy/datasets/__init__.py delete mode 100644 torchtext/legacy/datasets/babi.py delete mode 100644 torchtext/legacy/datasets/imdb.py delete mode 100644 torchtext/legacy/datasets/language_modeling.py delete mode 100644 torchtext/legacy/datasets/nli.py delete mode 100644 torchtext/legacy/datasets/sequence_tagging.py delete mode 100644 torchtext/legacy/datasets/sst.py delete mode 100644 torchtext/legacy/datasets/text_classification.py delete mode 100644 torchtext/legacy/datasets/translation.py delete mode 100644 torchtext/legacy/datasets/trec.py delete mode 100644 torchtext/legacy/datasets/unsupervised_learning.py delete mode 100755 torchtext/legacy/vocab.py diff --git a/benchmark/benchmark_vocab.py b/benchmark/benchmark_vocab.py index 544144d119..f8d23f875f 100644 --- a/benchmark/benchmark_vocab.py +++ b/benchmark/benchmark_vocab.py @@ -1,10 +1,6 @@ import argparse from collections import (Counter, OrderedDict) import time -import random -import string -from timeit import default_timer as timer -from matplotlib import pyplot as plt import torch from torchtext.datasets import DATASETS from torchtext.experimental.vocab_factory import ( @@ -13,15 +9,12 @@ ) from torchtext.vocab import build_vocab_from_iterator from torchtext.vocab import vocab as VocabNew -from torchtext.legacy.vocab import ( - Vocab, - build_vocab_from_iterator as build_vocab_from_iterator_legacy, -) -from torchtext.experimental.transforms import( +from torchtext.experimental.transforms import ( basic_english_normalize, ) from torchtext.data.utils import get_tokenizer + def build_vocab(data, transforms): def apply_transforms(data): for _, line in data: @@ -31,96 +24,16 @@ def apply_transforms(data): return vocab -def compare_legacy_and_new_batch_lookup(): - num_tokens = 1000 - num_letters = 6 - num_lines = 100000 - vocab = [''.join(random.sample(string.ascii_letters * num_letters, num_letters)) for _ in range(num_tokens)] - counter = Counter() - counter.update(vocab) - legacy_vocab = Vocab(counter) - new_vocab = VocabNew(counter) - speed_ups = [] - token_lengths = [i for i in range(2, 100)] - for i in token_lengths: - lines = [random.sample(vocab, i) for _ in range(num_lines)] - start_time = timer() - for text in lines: - legacy_vocab.lookup_indices(text) - legacy_time = timer() - start_time - - start_time = timer() - for text in lines: - new_vocab.lookup_indices(text) - - new_time = timer() - start_time - - speed_ups.append(legacy_time / new_time) - print("speed-up={} for average length={}".format(legacy_time / new_time, i)) - del lines - - plt.close() - fig, ax = plt.subplots(1, 1) - ax.plot(token_lengths, speed_ups) - ax.set_xlabel('Average Tokens per line') - ax.set_ylabel('Speed-up') - plt.savefig("speedup.jpg") - - -def legacy_vocab_from_file_object(file_like_object, **kwargs): - r"""Create a `Vocab` object from a file like object. - - The `file_like_object` should contain tokens seperated by new lines. Note that the vocab - will be created in the order that the tokens first appear in the file (and not by the frequency of tokens). - - Format for txt file: - token1 - token2 - ... - token_n - - Args: - file_like_object (FileObject): a file like object to read data from. - Remaining keyword arguments: Passed to the constructor of Vocab class. - - Returns: - Vocab: a `Vocab` object. 
- - Examples: - >>> from torchtext.vocab import vocab_from_file_object - >>> f = open('vocab.txt', 'r') - >>> v = vocab_from_file_object(f, specials=('', '', ''), specials_first=False) - """ - tokenizer = basic_english_normalize() - - def tokenize(line): - return tokenizer(line) - - def token_iterator(lines): - for line in lines: - for token in tokenize(line): - yield token - - return build_vocab_from_iterator_legacy(token_iterator(file_like_object)) - - -def benchmark_new_vocab_construction(vocab_file_path, is_raw_text=True, is_legacy=True, num_iters=1): +def benchmark_new_vocab_construction(vocab_file_path, is_raw_text=True, num_iters=1): f = open(vocab_file_path, 'r') t0 = time.monotonic() if is_raw_text: - if is_legacy: - print("Loading from raw text file with legacy python function") - for _ in range(num_iters): - legacy_vocab_from_file_object(f) - - print("Construction time:", time.monotonic() - t0) - else: - print("Loading from raw text file with basic_english_normalize tokenizer") - for _ in range(num_iters): - tokenizer = basic_english_normalize() - jited_tokenizer = torch.jit.script(tokenizer) - build_vocab_from_text_file(vocab_file_path, jited_tokenizer, num_cpus=1) - print("Construction time:", time.monotonic() - t0) + print("Loading from raw text file with basic_english_normalize tokenizer") + for _ in range(num_iters): + tokenizer = basic_english_normalize() + jited_tokenizer = torch.jit.script(tokenizer) + build_vocab_from_text_file(vocab_file_path, jited_tokenizer, num_cpus=1) + print("Construction time:", time.monotonic() - t0) else: for _ in range(num_iters): load_vocab_from_file(f) @@ -146,9 +59,9 @@ def _run_benchmark_lookup(tokens, vocab): tokens_lists = [] tokenizer = get_tokenizer("basic_english") for (_, text) in DATASETS[dataset](split='train'): - cur_tokens = tokenizer(text) - tokens_lists.append(cur_tokens) - tokens += cur_tokens + cur_tokens = tokenizer(text) + tokens_lists.append(cur_tokens) + tokens += cur_tokens if vocab_file_path: print("Loading Vocab from file {}".format(vocab_file_path)) @@ -158,12 +71,6 @@ def token_iterator(file_path): for token in f: yield token - # existing Vocab construction - print("Vocab") - t0 = time.monotonic() - v_existing = build_vocab_from_iterator_legacy(token_iterator(vocab_file_path)) - print("Construction time:", time.monotonic() - t0) - # new Vocab construction print("Vocab New") t0 = time.monotonic() @@ -176,12 +83,6 @@ def token_iterator(file_path): sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True) ordered_dict = OrderedDict(sorted_by_freq_tuples) - # existing Vocab construction - print("Vocab") - t0 = time.monotonic() - v_existing = Vocab(counter) - print("Construction time:", time.monotonic() - t0) - # new Vocab construction print("Vocab New") t0 = time.monotonic() @@ -189,12 +90,6 @@ def token_iterator(file_path): print("Construction time:", time.monotonic() - t0) jit_v_new = torch.jit.script(v_new) - # existing Vocab eager lookup - print("Vocab - Eager Mode") - _run_benchmark_lookup(tokens, v_existing) - _run_benchmark_lookup([tokens], v_existing) - _run_benchmark_lookup(tokens_lists, v_existing) - # new Vocab eager lookup print("Vocab New - Eager Mode") _run_benchmark_lookup(tokens, v_new) @@ -215,8 +110,6 @@ def token_iterator(file_path): help='run benchmark for constructing a vocab (default=False)') parser.add_argument('--is-raw-text', type=bool, default=True, help='construct vocab from raw text file (default=True)') - parser.add_argument('--is-legacy', type=bool, default=False, - 
help='construct vocab using legacy implementation (default=False)') parser.add_argument('--vocab-filename-construction', type=str, default='vocab.txt', help='The name of vocab file used for construction') parser.add_argument('--vocab-filename-lookup', type=str, default=None, @@ -226,8 +119,7 @@ def token_iterator(file_path): args = parser.parse_args() if args.run_construction_benchmark: - print("is_legacy", args.is_legacy) benchmark_new_vocab_construction(args.vocab_filename_construction, - is_raw_text=args.is_raw_text, is_legacy=args.is_legacy) + is_raw_text=args.is_raw_text) else: benchmark_new_vocab_lookup(args.vocab_filename_lookup, args.dataset) diff --git a/examples/legacy_tutorial/migration_tutorial.ipynb b/examples/legacy_tutorial/migration_tutorial.ipynb deleted file mode 100644 index 7be3ade927..0000000000 --- a/examples/legacy_tutorial/migration_tutorial.ipynb +++ /dev/null @@ -1,520 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "Migrate torchtext from the legacy API to the new API", - "provenance": [], - "collapsed_sections": [], - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "HMGWxQCO7s0e" - }, - "source": [ - "!pip install -U torch==1.8.0 torchtext==0.9.0\n", - "\n", - "# Reload environment\n", - "exit()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jXUgsnxw70-M" - }, - "source": [ - "This is a tutorial to show how to migrate from the legacy API in torchtext to the new API in 0.9.0 release. Here, we take the IMDB dataset as an example for the sentiment analysis. Both legacy and new APIs in torchtext can preprocess the text input and prepare the data to train/validate a model with the following steps:\n", - "\n", - "* Train/validate/test split: generate train/validate/test data set if they are available\n", - "* Tokenization: break a raw text string sentence into a list of words\n", - "* Vocab: define a \"contract\" from tokens to indexes\n", - "* Numericalize: convert a list of tokens to the corresponding indexes\n", - "* Batch: generate batches of data samples and add padding if necessary\n", - "\n", - "It should be noted that all the legacy features are still available, but within torchtext.legacy instead of torchtext." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WWRW4bsL8UL0" - }, - "source": [ - "## Step 1: Create a dataset object\n", - "----------------------------\n", - "\n", - "Fist of all, we create a dataset for the sentiment analysis. The individual data sample contains a label and a text string.\n", - "\n", - "### *Legacy*\n", - "In the legacy code, `Field` class is used for data processing, including tokenizer and numberzation. To check out the dataset, users need to first set up the TEXT/LABEL fields." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "FttPxcbc70j1" - }, - "source": [ - "import torchtext\n", - "import torch\n", - "from torchtext.legacy import data\n", - "from torchtext.legacy import datasets\n", - "\n", - "TEXT = data.Field()\n", - "LABEL = data.LabelField(dtype = torch.long)\n", - "legacy_train, legacy_test = datasets.IMDB.splits(TEXT, LABEL) # datasets here refers to torchtext.legacy.datasets" - ], - "execution_count": 7, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ssXfxJJSq7WT" - }, - "source": [ - "You can print out the raw data by checking out Dataset.examples. The entire text data are stored as a list of tokens." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "7DRXJFgzriaH" - }, - "source": [ - "legacy_examples = legacy_train.examples\n", - "print(legacy_examples[0].text, legacy_examples[0].label)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eQMfAN_Fz3aa" - }, - "source": [ - "### *New*\n", - "The new dataset API returns the train/test dataset split directly without the preprocessing information. Each split is an iterator which yields the raw texts and labels line-by-line." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "YHUYZ7yt0Lb5" - }, - "source": [ - "from torchtext.datasets import IMDB\n", - "train_iter, test_iter = IMDB(split=('train', 'test'))" - ], - "execution_count": 9, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yB7MShEBsd3P" - }, - "source": [ - "To print out the raw data, you can call the next() function on the IterableDataset." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "wUkWE1KWsPqy" - }, - "source": [ - "next(train_iter)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ycL7xqRP0eLU" - }, - "source": [ - "## Step 2 Build the data processing pipeline\n", - "----------------------------\n", - "\n", - "### *Legacy*\n", - "\n", - "The default tokenizer implemented in the `Field` class is the built-in python `split()` function. Users choose the tokenizer by calling `data.get_tokenizer()`, and add it to the `Field` constructor. For the sequence model, it's common to append `` (begin-of-sentence) and `` (end-of-sentence) tokens, and the special tokens need to be defined in the `Field` class." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "8H_I_XW8gSR1" - }, - "source": [ - "TEXT = data.Field(tokenize=data.get_tokenizer('basic_english'),\n", - " init_token='', eos_token='', lower=True)\n", - "LABEL = data.LabelField(dtype = torch.long)\n", - "legacy_train, legacy_test = datasets.IMDB.splits(TEXT, LABEL) # datasets here refers to torchtext.legacy.datasets" - ], - "execution_count": 11, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "opQ6LcnigTKx" - }, - "source": [ - "Now you can create a vocabulary of the words from the text file stored in the predefined `Field` object, `TEXT`. You fist have to build a vocabulary in your `Field` object by passing the dataset to the `build_vocab` func. The Field object builds the vocabulary (`TEXT.vocab`) on a specific data split." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Cffl6ueN8T5X" - }, - "source": [ - "TEXT.build_vocab(legacy_train)\n", - "LABEL.build_vocab(legacy_train)" - ], - "execution_count": 12, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OXQ9rmiHt58H" - }, - "source": [ - "Things you can do with a vocabuary object\n", - "\n", - "\n", - "* Total length of the vocabulary\n", - "* String2Index (stoi) and Index2String (itos)\n", - "* A purpose-specific vocabulary which contains word appearing more than N times\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "YzweKLh5uSNC" - }, - "source": [ - "legacy_vocab = TEXT.vocab\n", - "print(\"The length of the legacy vocab is\", len(legacy_vocab))\n", - "legacy_stoi = legacy_vocab.stoi\n", - "print(\"The index of 'example' is\", legacy_stoi['example'])\n", - "legacy_itos = legacy_vocab.itos\n", - "print(\"The token at index 686 is\", legacy_itos[686])\n", - "\n", - "# Set up the mim_freq value in the Vocab class\n", - "TEXT.build_vocab(legacy_train, min_freq=10)\n", - "legacy_vocab2 = TEXT.vocab\n", - "print(\"The length of the legacy vocab is\", len(legacy_vocab2))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LXTibHc00olW" - }, - "source": [ - "### *New*\n", - "\n", - "Users have the access to different kinds of tokenizers directly via `data.get_tokenizer()` function." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "QavK23zjhNlx" - }, - "source": [ - "from torchtext.data.utils import get_tokenizer\n", - "tokenizer = get_tokenizer('basic_english')" - ], - "execution_count": 14, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TnNpf4mWF5pe" - }, - "source": [ - "To have more flexibility, users can build the vocabulary directly with the Vocab class. For example, the argument `min_freq` is to set up the cutoff frequency to in the vocabulary. The special tokens, like `` and `` can be assigned to the special symbols in the constructor of the Vocab class." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Ro8HXPwmwtp7" - }, - "source": [ - "from collections import Counter\n", - "from torchtext.vocab import Vocab\n", - "\n", - "train_iter = IMDB(split='train')\n", - "counter = Counter()\n", - "for (label, line) in train_iter:\n", - " counter.update(tokenizer(line))\n", - "vocab = Vocab(counter, min_freq=10, specials=('', '', '', ''))" - ], - "execution_count": 15, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "xGuqqa7CxLq8" - }, - "source": [ - "print(\"The length of the new vocab is\", len(vocab))\n", - "new_stoi = vocab.stoi\n", - "print(\"The index of '' is\", new_stoi[''])\n", - "new_itos = vocab.itos\n", - "print(\"The token at index 2 is\", new_itos[2])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "l31FBekVr9j8" - }, - "source": [ - "Both `text_transform` and `label_transform` are the callable object, such as a lambda func here, to process the raw text and label data from the dataset iterators. Users can add the special symbols `` and `` to the sentence in `text_transform`." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ez2lT2QO0sNj" - }, - "source": [ - "text_transform = lambda x: [vocab['']] + [vocab[token] for token in tokenizer(x)] + [vocab['']]\n", - "label_transform = lambda x: 1 if x == 'pos' else 0\n", - "\n", - "# Print out the output of text_transform\n", - "print(\"input to the text_transform:\", \"here is an example\")\n", - "print(\"output of the text_transform:\", text_transform(\"here is an example\"))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4dEG7pyi1ElM" - }, - "source": [ - "## Step 3: Generate batch iterator\n", - "--------------------------------\n", - "\n", - "To train a model efficiently, it's recommended to build an iterator to generate data batch.\n", - "\n", - "### *Legacy*\n", - "The legacy `Iterator` class is used to batch the dataset and send to the target device, like CPU or GPU." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "NN67ofUB-sz1" - }, - "source": [ - "import torch\n", - "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - "legacy_train, legacy_test = datasets.IMDB.splits(TEXT, LABEL) # datasets here refers to torchtext.legacy.datasets\n", - "legacy_train_iterator, legacy_test_iterator = data.Iterator.splits(\n", - " (legacy_train, legacy_test), batch_size=8, device = device)" - ], - "execution_count": 18, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vBMjFVvsMPqR" - }, - "source": [ - "For a NLP workflow, it's also common to define an iterator and batch texts with similar lengths together. The legacy `BucketIterator` class in torchtext library minimizes the amount of padding needed." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "PgC6dhDqMOjp" - }, - "source": [ - "from torchtext.legacy.data import BucketIterator\n", - "legacy_train, legacy_test = datasets.IMDB.splits(TEXT, LABEL)\n", - "legacy_train_bucketiterator, legacy_test_bucketiterator = data.BucketIterator.splits(\n", - " (legacy_train, legacy_test),\n", - " sort_key=lambda x: len(x.text),\n", - " batch_size=8, device = device)" - ], - "execution_count": 19, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kBV-Wvlo07ye" - }, - "source": [ - "### *New*\n", - "\n", - "`torch.utils.data.DataLoader` is used to generate data batch. Users could customize the data batch by defining a function with the `collate_fn` argument in the DataLoader. Here, in the `collate_batch` func, we process the raw text data and add padding to dynamically match the longest sentence in a batch." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "EC054Wlr0-xB" - }, - "source": [ - "from torch.utils.data import DataLoader\n", - "from torch.nn.utils.rnn import pad_sequence\n", - "\n", - "def collate_batch(batch):\n", - " label_list, text_list = [], []\n", - " for (_label, _text) in batch:\n", - " label_list.append(label_transform(_label))\n", - " processed_text = torch.tensor(text_transform(_text))\n", - " text_list.append(processed_text)\n", - " return torch.tensor(label_list), pad_sequence(text_list, padding_value=3.0)\n", - "\n", - "train_iter = IMDB(split='train')\n", - "train_dataloader = DataLoader(list(train_iter), batch_size=8, shuffle=True, \n", - " collate_fn=collate_batch)" - ], - "execution_count": 20, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Jky4X-iFU4HK" - }, - "source": [ - "To group the texts with similar length together, like introduced in the legacy `BucketIterator` class, first of all, we randomly create multiple \"pools\", and each of them has a size of `batch_size * 100`. Then, we sort the samples within the individual pool by length. This idea can be implemented succintly through `batch_sampler` argument of PyTorch `Dataloader`. `batch_sampler` accepts 'Sampler' or Iterable object that yields indices of next batch. In the code below, we implemented a generator that yields batch of indices for which the corresponding batch of data is of similar length. " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "zCvxeLbYW3I_" - }, - "source": [ - "import random\n", - "\n", - "train_iter = IMDB(split='train')\n", - "train_list = list(train_iter)\n", - "batch_size = 8 # A batch size of 8\n", - "\n", - "def batch_sampler():\n", - " indices = [(i, len(tokenizer(s[1]))) for i, s in enumerate(train_list)]\n", - " random.shuffle(indices)\n", - " pooled_indices = []\n", - " # create pool of indices with similar lengths \n", - " for i in range(0, len(indices), batch_size * 100):\n", - " pooled_indices.extend(sorted(indices[i:i + batch_size * 100], key=lambda x: x[1]))\n", - "\n", - " pooled_indices = [x[0] for x in pooled_indices]\n", - "\n", - " # yield indices for current batch\n", - " for i in range(0, len(pooled_indices), batch_size):\n", - " yield pooled_indices[i:i + batch_size]\n", - "\n", - "bucket_dataloader = DataLoader(train_list, batch_sampler=batch_sampler(),\n", - " collate_fn=collate_batch)\n", - "\n", - "print(next(iter(bucket_dataloader)))" - ], - "execution_count": 24, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0wrbC_v01Ib9" - }, - "source": [ - "## Step 4: Iterate batch to train a model\n", - "-------------------------------\n", - "\n", - "It's almost same for both legacy and new APIs to iterate the data for batches during training and validating a model.\n", - "\n", - "### *Legacy*\n", - "\n", - "The legacy batch iterator can be iterated or executed with `next()` method." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "X_tml54u-6AS" - }, - "source": [ - "# for item in legacy_train_iterator:\n", - "# model(item)\n", - "\n", - "# Or\n", - "next(iter(legacy_train_iterator))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sRTvfxMB1P2P" - }, - "source": [ - "### *New*\n", - "\n", - "The batch iterator can be iterated or executed with `next()` method." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "iTotRtXe1CWn" - }, - "source": [ - "# for idx, (label, text) in enumerate(train_dataloader):\n", - "# model(item)\n", - "\n", - "# Or\n", - "next(iter(train_dataloader))" - ], - "execution_count": null, - "outputs": [] - } - ] -} diff --git a/examples/vocab/vocab.py b/examples/vocab/vocab.py deleted file mode 100755 index d76bfd94e4..0000000000 --- a/examples/vocab/vocab.py +++ /dev/null @@ -1,37 +0,0 @@ -import logging -import argparse - -import torch -import io - -from torchtext.legacy.vocab import build_vocab_from_iterator -from torchtext.data.utils import ngrams_iterator -from torchtext.data.utils import get_tokenizer -from torchtext.utils import unicode_csv_reader - - -def csv_iterator(data_path, ngrams): - tokenizer = get_tokenizer("basic_english") - with io.open(data_path, encoding="utf8") as f: - reader = unicode_csv_reader(f) - for row in reader: - tokens = ' '.join(row[1:]) - yield ngrams_iterator(tokenizer(tokens), ngrams) - - -parser = argparse.ArgumentParser( - description='Train a text classification model on AG_NEWS') -parser.add_argument('data_path') -parser.add_argument('save_vocab_path') -parser.add_argument('--ngrams', type=int, default=2) -parser.add_argument('--logging-level', default='WARNING') -args = parser.parse_args() - -ngrams = args.ngrams - -logging.basicConfig(level=getattr(logging, args.logging_level)) - -vocab = build_vocab_from_iterator(csv_iterator(args.data_path, ngrams)) - -print("Saving vocab to {}".format(args.save_vocab_path)) -torch.save(vocab, args.save_vocab_path) diff --git a/test/experimental/test_datasets.py b/test/experimental/test_datasets.py index 1868f6ee9d..18807c4958 100644 --- a/test/experimental/test_datasets.py +++ b/test/experimental/test_datasets.py @@ -10,7 +10,7 @@ class TestDataset(TorchtextTestCase): @skipIfNoModule("torchdata") - def test_sst2_dataset(self): + def test_sst2__dataset(self): split = ("train", "dev", "test") train_dataset, dev_dataset, test_dataset = sst2.SST2( diff --git a/test/legacy/__init__.py b/test/legacy/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/legacy/babi.py b/test/legacy/babi.py deleted file mode 100644 index cef3ff965d..0000000000 --- a/test/legacy/babi.py +++ /dev/null @@ -1,50 +0,0 @@ -from torchtext.legacy import datasets - -# en-valid -TRAIN_NUM = [0] + [900] * 16 + [904, 905, 900, 904] -VAL_NUM = [0] + [100] * 16 + [96, 95, 100, 96] -TEST_NUM = [0] + [1000] * 20 - -# Testcase 1 (joint training) -train_iter, val_iter, test_iter = datasets.BABI20.iters(task=1, joint=True) -assert len(train_iter.dataset) == sum(TRAIN_NUM) -assert len(val_iter.dataset) == VAL_NUM[1] -assert len(test_iter.dataset) == TEST_NUM[1] - -# Testcase 2 (only supporting) -train_iter, val_iter, test_iter = datasets.BABI20.iters(task=1, only_supporting=True) -assert len(train_iter.dataset) == TRAIN_NUM[2] -assert len(val_iter.dataset) == VAL_NUM[2] -assert len(test_iter.dataset) == TEST_NUM[2] - -# Testcase 3 (single task) -for i in range(1, 21): - train_iter, val_iter, test_iter = datasets.BABI20.iters(task=i) - assert len(train_iter.dataset) == TRAIN_NUM[i] - assert len(val_iter.dataset) == VAL_NUM[i] - assert len(test_iter.dataset) == TEST_NUM[i] - -# en-valid-10k -TRAIN_NUM = [0] + [9000] * 17 + [8996, 9000, 9002] -VAL_NUM = [0] + [1000] * 17 + [1004, 1000, 998] -TEST_NUM = [0] + [1000] * 20 - -# Testcase 1 (joint training) -train_iter, val_iter, test_iter = datasets.BABI20.iters(task=1, joint=True, tenK=True) -assert 
len(train_iter.dataset) == sum(TRAIN_NUM) -assert len(val_iter.dataset) == VAL_NUM[1] -assert len(test_iter.dataset) == TEST_NUM[1] - -# Testcase 2 (only supporting) -train_iter, val_iter, test_iter = datasets.BABI20.iters(task=1, only_supporting=True, - tenK=True) -assert len(train_iter.dataset) == TRAIN_NUM[2] -assert len(val_iter.dataset) == VAL_NUM[2] -assert len(test_iter.dataset) == TEST_NUM[2] - -# Testcase 3 (single task) -for i in range(1, 21): - train_iter, val_iter, test_iter = datasets.BABI20.iters(task=i, tenK=True) - assert len(train_iter.dataset) == TRAIN_NUM[i] - assert len(val_iter.dataset) == VAL_NUM[i] - assert len(test_iter.dataset) == TEST_NUM[i] diff --git a/test/legacy/data.py b/test/legacy/data.py deleted file mode 100644 index 98071a2fca..0000000000 --- a/test/legacy/data.py +++ /dev/null @@ -1,27 +0,0 @@ -from torchtext.legacy import data - - -TEXT = data.Field() -LABELS = data.Field() - -train, val, test = data.TabularDataset.splits( - path='~/chainer-research/jmt-data/pos_wsj/pos_wsj', train='.train', - validation='.dev', test='.test', format='tsv', - fields=[('text', TEXT), ('labels', LABELS)]) - -print(train.fields) -print(len(train)) -print(vars(train[0])) - -train_iter, val_iter, test_iter = data.BucketIterator.splits( - (train, val, test), batch_size=3, sort_key=lambda x: len(x.text), device="cuda:0") - -LABELS.build_vocab(train.labels) -TEXT.build_vocab(train.text) - -print(TEXT.vocab.freqs.most_common(10)) -print(LABELS.vocab.itos) - -batch = next(iter(train_iter)) -print(batch.text) -print(batch.labels) diff --git a/test/legacy/data/__init__.py b/test/legacy/data/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/legacy/data/test_batch.py b/test/legacy/data/test_batch.py deleted file mode 100644 index cf4a1f76a4..0000000000 --- a/test/legacy/data/test_batch.py +++ /dev/null @@ -1,43 +0,0 @@ -import torch -import torchtext.legacy.data as data - -from ...common.torchtext_test_case import TorchtextTestCase - - -class TestDataset(TorchtextTestCase): - def test_batch_with_missing_field(self): - # smoke test to see if batches with missing attributes are shown properly - with open(self.test_missing_field_dataset_path, "wt") as f: - f.write("text,label\n1,0") - - dst = data.TabularDataset(path=self.test_missing_field_dataset_path, - format="csv", skip_header=True, - fields=[("text", data.Field(use_vocab=False, - sequential=False)), - ("label", None)]) - itr = data.Iterator(dst, batch_size=64) - str(next(itr.__iter__())) - - def test_batch_iter(self): - self.write_test_numerical_features_dataset() - FLOAT = data.Field(use_vocab=False, sequential=False, - dtype=torch.float) - INT = data.Field(use_vocab=False, sequential=False, is_target=True) - TEXT = data.Field(sequential=False) - - dst = data.TabularDataset(path=self.test_numerical_features_dataset_path, - format="tsv", skip_header=False, - fields=[("float", FLOAT), - ("int", INT), - ("text", TEXT)]) - TEXT.build_vocab(dst) - itr = data.Iterator(dst, batch_size=2, device=-1, shuffle=False) - fld_order = [k for k, v in dst.fields.items() if - v is not None and not v.is_target] - batch = next(iter(itr)) - (x1, x2), y = batch - x = (x1, x2)[fld_order.index("float")] - self.assertEqual(y.data[0], 1) - self.assertEqual(y.data[1], 12) - self.assertAlmostEqual(x.data[0], 0.1, places=4) - self.assertAlmostEqual(x.data[1], 0.5, places=4) diff --git a/test/legacy/data/test_dataset.py b/test/legacy/data/test_dataset.py deleted file mode 100644 index 80484b126b..0000000000 --- 
a/test/legacy/data/test_dataset.py +++ /dev/null @@ -1,511 +0,0 @@ -# -*- coding: utf-8 -*- -from torchtext.legacy import data -import os -import sys -import tempfile -import unittest - -import pytest - -from ...common.torchtext_test_case import TorchtextTestCase -from ...common.assets import conditional_remove - - -class TestDataset(TorchtextTestCase): - - def test_wikitext2_legacy(self): - from torchtext.legacy.datasets import WikiText2 - cachedir = os.path.join(self.project_root, ".data", "wikitext-2") - conditional_remove(cachedir) - - ds = WikiText2 - TEXT = data.Field(lower=True, batch_first=True) - train, valid, test = ds.splits(TEXT) - TEXT.build_vocab(train) - train_iter, valid_iter, test_iter = data.BPTTIterator.splits( - (train, valid, test), batch_size=3, bptt_len=30) - - train_iter, valid_iter, test_iter = ds.iters(batch_size=4, - bptt_len=30) - - conditional_remove(cachedir) - - def test_penntreebank_legacy(self): - from torchtext.legacy.datasets import PennTreebank - # smoke test to ensure penn treebank works properly - TEXT = data.Field(lower=True, batch_first=True) - ds = PennTreebank - train, valid, test = ds.splits(TEXT) - TEXT.build_vocab(train) - train_iter, valid_iter, test_iter = data.BPTTIterator.splits( - (train, valid, test), batch_size=3, bptt_len=30) - - train_iter, valid_iter, test_iter = ds.iters(batch_size=4, - bptt_len=30) - - def test_tabular_simple_data(self): - for data_format in ["csv", "tsv", "json"]: - self.write_test_ppid_dataset(data_format=data_format) - - if data_format == "json": - question_field = data.Field(sequential=True) - label_field = data.Field(sequential=False) - fields = {"question1": ("q1", question_field), - "question2": ("q2", question_field), - "label": ("label", label_field)} - else: - question_field = data.Field(sequential=True) - label_field = data.Field(sequential=False) - fields = [("id", None), ("q1", question_field), - ("q2", question_field), ("label", label_field)] - - dataset = data.TabularDataset( - path=self.test_ppid_dataset_path, format=data_format, fields=fields) - - assert len(dataset) == 3 - - expected_examples = [ - (["When", "do", "you", "use", "シ", "instead", "of", "し?"], - ["When", "do", "you", "use", "\"&\"", - "instead", "of", "\"and\"?"], "0"), - (["Where", "was", "Lincoln", "born?"], - ["Which", "location", "was", "Abraham", "Lincoln", "born?"], "1"), - (["What", "is", "2+2"], ["2+2=?"], "1")] - - # Ensure examples have correct contents / test __getitem__ - for i in range(len(dataset)): - self.assertEqual(dataset[i].q1, expected_examples[i][0]) - self.assertEqual(dataset[i].q2, expected_examples[i][1]) - self.assertEqual(dataset[i].label, expected_examples[i][2]) - - # Test __getattr__ - for i, (q1, q2, label) in enumerate(zip(dataset.q1, dataset.q2, - dataset.label)): - self.assertEqual(q1, expected_examples[i][0]) - self.assertEqual(q2, expected_examples[i][1]) - self.assertEqual(label, expected_examples[i][2]) - - # Test __iter__ - for i, example in enumerate(dataset): - self.assertEqual(example.q1, expected_examples[i][0]) - self.assertEqual(example.q2, expected_examples[i][1]) - self.assertEqual(example.label, expected_examples[i][2]) - - def test_json_valid_and_invalid_nested_key(self): - self.write_test_nested_key_json_dataset() - valid_fields = {'foods.vegetables.name': ('vegs', data.Field()), - 'foods.fruits': ('fruits', data.Field())} - invalid_fields = {'foods.vegetables.color': ('vegs', data.Field())} - - expected_examples = [ - {"fruits": ["Apple", "Banana"], - "vegs": ["Broccoli", "Cabbage"]}, - 
{"fruits": ["Cherry", "Grape", "Lemon"], - "vegs": ["Cucumber", "Lettuce"]}, - {"fruits": ["Orange", "Pear", "Strawberry"], - "vegs": ["Marrow", "Spinach"]} - ] - dataset = data.TabularDataset( - path=self.test_nested_key_json_dataset_path, - format="json", - fields=valid_fields) - # check results - for example, expect in zip(dataset.examples, expected_examples): - self.assertEqual(example.vegs, expect['vegs']) - self.assertEqual(example.fruits, expect['fruits']) - - with self.assertRaises(ValueError): - data.TabularDataset( - path=self.test_nested_key_json_dataset_path, - format="json", - fields=invalid_fields) - - def test_errors(self): - # Ensure that trying to retrieve a key not in JSON data errors - self.write_test_ppid_dataset(data_format="json") - - question_field = data.Field(sequential=True) - label_field = data.Field(sequential=False) - fields = {"qeustion1": ("q1", question_field), - "question2": ("q2", question_field), - "label": ("label", label_field)} - - with self.assertRaises(ValueError): - data.TabularDataset( - path=self.test_ppid_dataset_path, format="json", fields=fields) - - def test_input_with_newlines_in_text(self): - # Smoke test for ensuring that TabularDataset works with files with newlines - example_with_newlines = [("\"hello \n world\"", "1"), - ("\"there is a \n newline\"", "0"), - ("\"there is no newline\"", "1")] - fields = [("text", data.Field(lower=True)), - ("label", data.Field(sequential=False))] - - for delim in [",", "\t"]: - with open(self.test_newline_dataset_path, "wt") as f: - for line in example_with_newlines: - f.write("{}\n".format(delim.join(line))) - - format_ = "csv" if delim == "," else "tsv" - dataset = data.TabularDataset( - path=self.test_newline_dataset_path, format=format_, fields=fields) - # if the newline is not parsed correctly, this should raise an error - for example in dataset: - self.assertTrue(hasattr(example, "text")) - self.assertTrue(hasattr(example, "label")) - - def test_csv_file_with_header(self): - example_with_header = [("text", "label"), - ("HELLO WORLD", "0"), - ("goodbye world", "1")] - - TEXT = data.Field(lower=True, tokenize=lambda x: x.split()) - fields = { - "label": ("label", data.Field(use_vocab=False, - sequential=False)), - "text": ("text", TEXT) - } - - for format_, delim in zip(["csv", "tsv"], [",", "\t"]): - with open(self.test_has_header_dataset_path, "wt") as f: - for line in example_with_header: - f.write("{}\n".format(delim.join(line))) - - # check that an error is raised here if a non-existent field is specified - with self.assertRaises(ValueError): - data.TabularDataset( - path=self.test_has_header_dataset_path, format=format_, - fields={"non_existent": ("label", data.Field())}) - - dataset = data.TabularDataset( - path=self.test_has_header_dataset_path, format=format_, - skip_header=False, fields=fields) - - TEXT.build_vocab(dataset) - - for i, example in enumerate(dataset): - self.assertEqual(example.text, - example_with_header[i + 1][0].lower().split()) - self.assertEqual(example.label, example_with_header[i + 1][1]) - - # check that the vocabulary is built correctly (#225) - expected_freqs = {"hello": 1, "world": 2, "goodbye": 1, "text": 0} - for k, v in expected_freqs.items(): - self.assertEqual(TEXT.vocab.freqs[k], v) - - data_iter = data.Iterator(dataset, batch_size=1, - sort_within_batch=False, repeat=False) - next(data_iter.__iter__()) - - @unittest.skipIf(sys.platform == "win32", "FIXME: tempfile could not be opened twice on Windows") - def test_csv_dataset_quotechar(self): - # Based on issue 
#349 - example_data = [("text", "label"), - ('" hello world', "0"), - ('goodbye " world', "1"), - ('this is a pen " ', "0")] - - with tempfile.NamedTemporaryFile(dir=self.test_dir) as f: - for example in example_data: - f.write("{}\n".format(",".join(example)).encode("latin-1")) - - TEXT = data.Field(lower=True, tokenize=lambda x: x.split()) - fields = { - "label": ("label", data.Field(use_vocab=False, - sequential=False)), - "text": ("text", TEXT) - } - - f.seek(0) - - dataset = data.TabularDataset( - path=f.name, format="csv", - skip_header=False, fields=fields, - csv_reader_params={"quotechar": None}) - - TEXT.build_vocab(dataset) - - self.assertEqual(len(dataset), len(example_data) - 1) - - for i, example in enumerate(dataset): - self.assertEqual(example.text, - example_data[i + 1][0].lower().split()) - self.assertEqual(example.label, example_data[i + 1][1]) - - def test_dataset_split_arguments(self): - num_examples, num_labels = 30, 3 - self.write_test_splitting_dataset(num_examples=num_examples, - num_labels=num_labels) - text_field = data.Field() - label_field = data.LabelField() - fields = [('text', text_field), ('label', label_field)] - - dataset = data.TabularDataset( - path=self.test_dataset_splitting_path, format="csv", fields=fields) - - # Test default split ratio (0.7) - expected_train_size = 21 - expected_test_size = 9 - - train, test = dataset.split() - assert len(train) == expected_train_size - assert len(test) == expected_test_size - - # Test array arguments with same ratio - split_ratio = [0.7, 0.3] - train, test = dataset.split(split_ratio=split_ratio) - assert len(train) == expected_train_size - assert len(test) == expected_test_size - - # Add validation set - split_ratio = [0.6, 0.3, 0.1] - expected_train_size = 18 - expected_valid_size = 3 - expected_test_size = 9 - - train, valid, test = dataset.split(split_ratio=split_ratio) - assert len(train) == expected_train_size - assert len(valid) == expected_valid_size - assert len(test) == expected_test_size - - # Test ratio normalization - split_ratio = [6, 3, 1] - train, valid, test = dataset.split(split_ratio=split_ratio) - assert len(train) == expected_train_size - assert len(valid) == expected_valid_size - assert len(test) == expected_test_size - - # Test only two splits returned for too small valid split size - split_ratio = [0.66, 0.33, 0.01] - expected_length = 2 - splits = dataset.split(split_ratio=split_ratio) - assert len(splits) == expected_length - - # Test invalid arguments - split_ratio = 1.1 - with pytest.raises(AssertionError): - dataset.split(split_ratio=split_ratio) - - split_ratio = -1. 
- with pytest.raises(AssertionError): - dataset.split(split_ratio=split_ratio) - - split_ratio = [0.7] - with pytest.raises(AssertionError): - dataset.split(split_ratio=split_ratio) - - split_ratio = [1, 2, 3, 4] - with pytest.raises(AssertionError): - dataset.split(split_ratio=split_ratio) - - split_ratio = "string" - with pytest.raises(ValueError): - dataset.split(split_ratio=split_ratio) - - def test_stratified_dataset_split(self): - num_examples, num_labels = 30, 3 - self.write_test_splitting_dataset(num_examples=num_examples, - num_labels=num_labels) - text_field = data.Field() - label_field = data.LabelField() - fields = [('text', text_field), ('label', label_field)] - - dataset = data.TabularDataset( - path=self.test_dataset_splitting_path, format="csv", fields=fields) - - # Default split ratio - expected_train_size = 21 - expected_test_size = 9 - - train, test = dataset.split(stratified=True) - assert len(train) == expected_train_size - assert len(test) == expected_test_size - - # Test array arguments with same ratio - split_ratio = [0.7, 0.3] - train, test = dataset.split(split_ratio=split_ratio, stratified=True) - assert len(train) == expected_train_size - assert len(test) == expected_test_size - - # Test strata_field argument - train, test = dataset.split(split_ratio=split_ratio, stratified=True, - strata_field='label') - assert len(train) == expected_train_size - assert len(test) == expected_test_size - - # Test invalid field name - strata_field = 'dummy' - with pytest.raises(ValueError): - dataset.split(split_ratio=split_ratio, stratified=True, - strata_field=strata_field) - - # Test uneven stratify sizes - num_examples, num_labels = 28, 3 - self.write_test_splitting_dataset(num_examples=num_examples, - num_labels=num_labels) - # 10 examples for class 1 and 9 examples for classes 2,3 - dataset = data.TabularDataset( - path=self.test_dataset_splitting_path, format="csv", fields=fields) - - expected_train_size = 7 + 6 + 6 - expected_test_size = 3 + 3 + 3 - train, test = dataset.split(split_ratio=split_ratio, stratified=True) - assert len(train) == expected_train_size - assert len(test) == expected_test_size - - split_ratio = [0.7, 0.3] - train, test = dataset.split(split_ratio=split_ratio, stratified=True) - assert len(train) == expected_train_size - assert len(test) == expected_test_size - - # Add validation set - split_ratio = [0.6, 0.3, 0.1] - expected_train_size = 6 + 5 + 5 - expected_valid_size = 1 + 1 + 1 - expected_test_size = 3 + 3 + 3 - train, valid, test = dataset.split(split_ratio=split_ratio, stratified=True) - assert len(train) == expected_train_size - assert len(valid) == expected_valid_size - assert len(test) == expected_test_size - - def test_filter(self): - # Create test examples - sentence11 = [["who", "is", "there"]] - sentence12 = [["bernardo", "is", "there"]] - label1 = [1] - sentence21 = [["nay", "answer", "me"]] - sentence22 = [["stand", "unfold", "yourself"]] - label2 = [0] - sentence31 = [["is", "Horatio", "there"]] - sentence32 = [["a", "piece", "of", "him"]] - label3 = [0] - - example1_values = sentence11 + sentence12 + label1 - example2_values = sentence21 + sentence22 + label2 - example3_values = sentence31 + sentence32 + label3 - - # Test filter remove words from single field only - dataset, text_field = filter_init( - example1_values, - example2_values, - example3_values - ) - - text_field.vocab.stoi.pop("there") - text_field.vocab.stoi.pop("bernardo") - - dataset.filter_examples(["text1"]) - - assert dataset[0].text1 == ["who", "is"] - assert 
dataset[0].text2 == ["bernardo", "is", "there"] - assert dataset[0].label == 1 - - assert dataset[1].text1 == ["nay", "answer", "me"] - assert dataset[1].text2 == ["stand", "unfold", "yourself"] - assert dataset[1].label == 0 - - assert dataset[2].text1 == ["is", "Horatio"] - assert dataset[2].text2 == ["a", "piece", "of", "him"] - assert dataset[2].label == 0 - - # Test filter remove words from multiple fields - dataset, text_field = filter_init( - example1_values, - example2_values, - example3_values - ) - - text_field.vocab.stoi.pop("there") - text_field.vocab.stoi.pop("bernardo") - - dataset.filter_examples(["text1", "text2"]) - - assert dataset[0].text1 == ["who", "is"] - assert dataset[0].text2 == ["is"] - assert dataset[0].label == 1 - - assert dataset[1].text1 == ["nay", "answer", "me"] - assert dataset[1].text2 == ["stand", "unfold", "yourself"] - assert dataset[1].label == 0 - - assert dataset[2].text1 == ["is", "Horatio"] - assert dataset[2].text2 == ["a", "piece", "of", "him"] - assert dataset[2].label == 0 - - # Test filter remove all words in example - dataset, text_field = filter_init( - example1_values, - example2_values, - example3_values - ) - - text_field.vocab.stoi.pop("who") - text_field.vocab.stoi.pop("is") - text_field.vocab.stoi.pop("there") - - dataset.filter_examples(["text1", "text2"]) - - assert dataset[0].text1 == [] - assert dataset[0].text2 == ["bernardo"] - assert dataset[0].label == 1 - - assert dataset[1].text1 == ["nay", "answer", "me"] - assert dataset[1].text2 == ["stand", "unfold", "yourself"] - assert dataset[1].label == 0 - - assert dataset[2].text1 == ["Horatio"] - assert dataset[2].text2 == ["a", "piece", "of", "him"] - assert dataset[2].label == 0 - - def test_gz_extraction(self): - # tar.gz file contains train.txt and test.txt - tgz = (b'\x1f\x8b\x08\x00\x1e\xcc\xd5Z\x00\x03\xed\xd1;\n\x800\x10E' - b'\xd1,%+\x90\xc9G\xb3\x1e\x0b\x0b\x1b\x03q\x04\x97\xef\xa7' - b'\xb0\xb0P,R\x08\xf74o`\x9aa\x9e\x96~\x9c\x1a]\xd5\xd4#\xbb' - b'\x94\xd2\x99\xbb{\x9e\xb3\x0b\xbekC\x8c\x12\x9c\x11\xe7b\x10c' - b'\xa5\xe2M\x97e\xd6\xbeXkJ\xce\x8f?x\xdb\xff\x94\x0e\xb3V\xae' - b'\xff[\xffQ\x8e\xfe}\xf2\xf4\x0f\x00\x00\x00\x00\x00\x00\x00' - b'\x00\x00\x00\x00\x00\x00O6\x1c\xc6\xbd\x89\x00(\x00\x00') - - # .gz file contains dummy.txt - gz = (b'\x1f\x8b\x08\x08W\xce\xd5Z\x00\x03dummy.txt\x00\x0bq\r\x0e\x01' - b'\x00\xb8\x93\xea\xee\x04\x00\x00\x00') - - # Create both files - with open(os.path.join(self.test_dir, 'dummy.tar.gz'), 'wb') as fp: - fp.write(tgz) - - with open(os.path.join(self.test_dir, 'dummy.txt.gz'), 'wb') as fp: - fp.write(gz) - - # Set the urls in a dummy class - class DummyDataset(data.Dataset): - urls = ['dummy.tar.gz', 'dummy.txt.gz'] - name = '' - dirname = '' - - # Run extraction - DummyDataset.download(self.test_dir, check='') - - # Check if files were extracted correctly - assert os.path.isfile(os.path.join(self.test_dir, 'dummy.txt')) - assert os.path.isfile(os.path.join(self.test_dir, 'train.txt')) - assert os.path.isfile(os.path.join(self.test_dir, 'test.txt')) - - -def filter_init(ex_val1, ex_val2, ex_val3): - text_field = data.Field(sequential=True) - label_field = data.Field(sequential=False) - fields = [("text1", text_field), ("text2", text_field), - ("label", label_field)] - - example1 = data.Example.fromlist(ex_val1, fields) - example2 = data.Example.fromlist(ex_val2, fields) - example3 = data.Example.fromlist(ex_val3, fields) - examples = [example1, example2, example3] - - dataset = data.Dataset(examples, fields) - 
text_field.build_vocab(dataset) - - return dataset, text_field diff --git a/test/legacy/data/test_field.py b/test/legacy/data/test_field.py deleted file mode 100644 index b66f5870e4..0000000000 --- a/test/legacy/data/test_field.py +++ /dev/null @@ -1,901 +0,0 @@ -# -*- coding: utf-8 -*- -from collections import Counter -import os - -import torch -import torchtext.legacy.data as data -import pytest - -from ...common.torchtext_test_case import TorchtextTestCase, verify_numericalized_example - - -class TestField(TorchtextTestCase): - def test_process(self): - raw_field = data.RawField() - field = data.Field(sequential=True, use_vocab=False, batch_first=True) - - # Test tensor-like batch data which is accepted by both RawField and Field - batch = [[1, 2, 3], [2, 3, 4]] - batch_tensor = torch.LongTensor(batch) - - raw_field_processed = raw_field.process(batch) - field_processed = field.process(batch) - - assert raw_field_processed == batch - assert field_processed.data.equal(batch_tensor) - - # Test non-tensor data which is only accepted by RawField - any_obj = [object() for _ in range(5)] - - raw_field_processed = raw_field.process(any_obj) - assert any_obj == raw_field_processed - - with pytest.raises(TypeError): - field.process(any_obj) - - def test_preprocess(self): - # Default case. - field = data.Field() - assert field.preprocess("Test string.") == ["Test", "string."] - - # Test that lowercase is properly applied. - field_lower = data.Field(lower=True) - assert field_lower.preprocess("Test string.") == ["test", "string."] - - # Test that custom preprocessing pipelines are properly applied. - preprocess_pipeline = data.Pipeline(lambda x: x + "!") - field_preprocessing = data.Field(preprocessing=preprocess_pipeline, - lower=True) - assert field_preprocessing.preprocess("Test string.") == ["test!", "string.!"] - - # Test that non-sequential data is properly handled. - field_not_sequential = data.Field(sequential=False, lower=True, - preprocessing=preprocess_pipeline) - assert field_not_sequential.preprocess("Test string.") == "test string.!" - - # Non-regression test that we do not try to decode unicode strings to unicode - field_not_sequential = data.Field(sequential=False, lower=True, - preprocessing=preprocess_pipeline) - assert field_not_sequential.preprocess("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t!" - - def test_pad(self): - # Default case. - field = data.Field() - minibatch = [["a", "sentence", "of", "data", "."], - ["yet", "another"], - ["one", "last", "sent"]] - expected_padded_minibatch = [["a", "sentence", "of", "data", "."], - ["yet", "another", "", "", ""], - ["one", "last", "sent", "", ""]] - expected_lengths = [5, 2, 3] - assert field.pad(minibatch) == expected_padded_minibatch - field = data.Field(include_lengths=True) - assert field.pad(minibatch) == (expected_padded_minibatch, expected_lengths) - - # Test fix_length properly truncates and pads. 
- field = data.Field(fix_length=3) - minibatch = [["a", "sentence", "of", "data", "."], - ["yet", "another"], - ["one", "last", "sent"]] - expected_padded_minibatch = [["a", "sentence", "of"], - ["yet", "another", ""], - ["one", "last", "sent"]] - expected_lengths = [3, 2, 3] - assert field.pad(minibatch) == expected_padded_minibatch - field = data.Field(fix_length=3, include_lengths=True) - assert field.pad(minibatch) == (expected_padded_minibatch, expected_lengths) - field = data.Field(fix_length=3, truncate_first=True) - expected_padded_minibatch = [["of", "data", "."], - ["yet", "another", ""], - ["one", "last", "sent"]] - assert field.pad(minibatch) == expected_padded_minibatch - - # Test init_token is properly handled. - field = data.Field(fix_length=4, init_token="") - minibatch = [["a", "sentence", "of", "data", "."], - ["yet", "another"], - ["one", "last", "sent"]] - expected_padded_minibatch = [["", "a", "sentence", "of"], - ["", "yet", "another", ""], - ["", "one", "last", "sent"]] - expected_lengths = [4, 3, 4] - assert field.pad(minibatch) == expected_padded_minibatch - field = data.Field(fix_length=4, init_token="", include_lengths=True) - assert field.pad(minibatch) == (expected_padded_minibatch, expected_lengths) - - # Test init_token and eos_token are properly handled. - field = data.Field(init_token="", eos_token="") - minibatch = [["a", "sentence", "of", "data", "."], - ["yet", "another"], - ["one", "last", "sent"]] - expected_padded_minibatch = [ - ["", "a", "sentence", "of", "data", ".", ""], - ["", "yet", "another", "", "", "", ""], - ["", "one", "last", "sent", "", "", ""]] - expected_lengths = [7, 4, 5] - assert field.pad(minibatch) == expected_padded_minibatch - field = data.Field(init_token="", eos_token="", include_lengths=True) - assert field.pad(minibatch) == (expected_padded_minibatch, expected_lengths) - - # Test that non-sequential data is properly handled. 
- field = data.Field(init_token="", eos_token="", sequential=False) - minibatch = [["contradiction"], - ["neutral"], - ["entailment"]] - assert field.pad(minibatch) == minibatch - field = data.Field(init_token="", eos_token="", - sequential=False, include_lengths=True) - assert field.pad(minibatch) == minibatch - - def test_build_vocab(self): - # Set up fields - question_field = data.Field(sequential=True) - label_field = data.Field(sequential=False) - - # Write TSV dataset and construct a Dataset - self.write_test_ppid_dataset(data_format="tsv") - tsv_fields = [("id", None), ("q1", question_field), - ("q2", question_field), ("label", label_field)] - tsv_dataset = data.TabularDataset( - path=self.test_ppid_dataset_path, format="tsv", - fields=tsv_fields) - - # Write JSON dataset and construct a Dataset - self.write_test_ppid_dataset(data_format="json") - json_fields = {"question1": ("q1", question_field), - "question2": ("q2", question_field), - "label": ("label", label_field)} - json_dataset = data.TabularDataset( - path=self.test_ppid_dataset_path, format="json", - fields=json_fields) - - # Test build_vocab default - question_field.build_vocab(tsv_dataset, json_dataset, specials=['']) - assert question_field.vocab.freqs == Counter( - {'When': 4, 'do': 4, 'you': 4, 'use': 4, 'instead': 4, - 'of': 4, 'was': 4, 'Lincoln': 4, 'born?': 4, 'シ': 2, - 'し?': 2, 'Where': 2, 'What': 2, 'is': 2, '2+2': 2, - '"&"': 2, '"and"?': 2, 'Which': 2, 'location': 2, - 'Abraham': 2, '2+2=?': 2}) - expected_stoi = {'': 0, '': 1, '': 2, - 'Lincoln': 3, 'When': 4, - 'born?': 5, 'do': 6, 'instead': 7, 'of': 8, - 'use': 9, 'was': 10, 'you': 11, '"&"': 12, - '"and"?': 13, '2+2': 14, '2+2=?': 15, 'Abraham': 16, - 'What': 17, 'Where': 18, 'Which': 19, 'is': 20, - 'location': 21, 'し?': 22, 'シ': 23} - assert dict(question_field.vocab.stoi) == expected_stoi - # Turn the stoi dictionary into an itos list - expected_itos = [x[0] for x in sorted(expected_stoi.items(), - key=lambda tup: tup[1])] - assert question_field.vocab.itos == expected_itos - - label_field.build_vocab(tsv_dataset, json_dataset) - assert label_field.vocab.freqs == Counter({'1': 4, '0': 2}) - expected_stoi = {'1': 1, '0': 2, '': 0} - assert dict(label_field.vocab.stoi) == expected_stoi - # Turn the stoi dictionary into an itos list - expected_itos = [x[0] for x in sorted(expected_stoi.items(), - key=lambda tup: tup[1])] - assert label_field.vocab.itos == expected_itos - - # Test build_vocab default - question_field.build_vocab(tsv_dataset, json_dataset) - assert question_field.vocab.freqs == Counter( - {'When': 4, 'do': 4, 'you': 4, 'use': 4, 'instead': 4, - 'of': 4, 'was': 4, 'Lincoln': 4, 'born?': 4, 'シ': 2, - 'し?': 2, 'Where': 2, 'What': 2, 'is': 2, '2+2': 2, - '"&"': 2, '"and"?': 2, 'Which': 2, 'location': 2, - 'Abraham': 2, '2+2=?': 2}) - expected_stoi = {'': 0, '': 1, 'Lincoln': 2, 'When': 3, - 'born?': 4, 'do': 5, 'instead': 6, 'of': 7, - 'use': 8, 'was': 9, 'you': 10, '"&"': 11, - '"and"?': 12, '2+2': 13, '2+2=?': 14, 'Abraham': 15, - 'What': 16, 'Where': 17, 'Which': 18, 'is': 19, - 'location': 20, 'し?': 21, 'シ': 22} - assert dict(question_field.vocab.stoi) == expected_stoi - # Turn the stoi dictionary into an itos list - expected_itos = [x[0] for x in sorted(expected_stoi.items(), - key=lambda tup: tup[1])] - assert question_field.vocab.itos == expected_itos - - label_field.build_vocab(tsv_dataset, json_dataset) - assert label_field.vocab.freqs == Counter({'1': 4, '0': 2}) - expected_stoi = {'1': 1, '0': 2, '': 0} - assert 
dict(label_field.vocab.stoi) == expected_stoi - # Turn the stoi dictionary into an itos list - expected_itos = [x[0] for x in sorted(expected_stoi.items(), - key=lambda tup: tup[1])] - assert label_field.vocab.itos == expected_itos - - # Test build_vocab with extra kwargs passed to Vocab - question_field.build_vocab(tsv_dataset, json_dataset, max_size=8, - min_freq=3) - assert question_field.vocab.freqs == Counter( - {'When': 4, 'do': 4, 'you': 4, 'use': 4, 'instead': 4, - 'of': 4, 'was': 4, 'Lincoln': 4, 'born?': 4, 'シ': 2, - 'し?': 2, 'Where': 2, 'What': 2, 'is': 2, '2+2': 2, - '"&"': 2, '"and"?': 2, 'Which': 2, 'location': 2, - 'Abraham': 2, '2+2=?': 2}) - expected_stoi = {'': 0, '': 1, 'Lincoln': 2, 'When': 3, - 'born?': 4, 'do': 5, 'instead': 6, 'of': 7, - 'use': 8, 'was': 9} - assert dict(question_field.vocab.stoi) == expected_stoi - # Turn the stoi dictionary into an itos list - expected_itos = [x[0] for x in sorted(expected_stoi.items(), - key=lambda tup: tup[1])] - assert question_field.vocab.itos == expected_itos - - def test_numericalize_basic(self): - self.write_test_ppid_dataset(data_format="tsv") - question_field = data.Field(sequential=True) - tsv_fields = [("id", None), ("q1", question_field), - ("q2", question_field), ("label", None)] - tsv_dataset = data.TabularDataset( - path=self.test_ppid_dataset_path, format="tsv", - fields=tsv_fields) - question_field.build_vocab(tsv_dataset) - - test_example_data = [["When", "do", "you", "use", "シ", - "instead", "of", "し?"], - ["What", "is", "2+2", "", "", - "", "", ""], - ["Here", "is", "a", "sentence", "with", - "some", "oovs", ""]] - - # Test default - default_numericalized = question_field.numericalize(test_example_data) - verify_numericalized_example(question_field, test_example_data, - default_numericalized) - - def test_numericalize_include_lengths(self): - self.write_test_ppid_dataset(data_format="tsv") - question_field = data.Field(sequential=True, include_lengths=True) - tsv_fields = [("id", None), ("q1", question_field), - ("q2", question_field), ("label", None)] - tsv_dataset = data.TabularDataset( - path=self.test_ppid_dataset_path, format="tsv", - fields=tsv_fields) - question_field.build_vocab(tsv_dataset) - - test_example_data = [["When", "do", "you", "use", "シ", - "instead", "of", "し?"], - ["What", "is", "2+2", "", "", - "", "", ""], - ["Here", "is", "a", "sentence", "with", - "some", "oovs", ""]] - test_example_lengths = [8, 3, 7] - - # Test with include_lengths - include_lengths_numericalized = question_field.numericalize( - (test_example_data, test_example_lengths)) - verify_numericalized_example(question_field, - test_example_data, - include_lengths_numericalized, - test_example_lengths) - - def test_numericalize_batch_first(self): - self.write_test_ppid_dataset(data_format="tsv") - question_field = data.Field(sequential=True, batch_first=True) - tsv_fields = [("id", None), ("q1", question_field), - ("q2", question_field), ("label", None)] - tsv_dataset = data.TabularDataset( - path=self.test_ppid_dataset_path, format="tsv", - fields=tsv_fields) - question_field.build_vocab(tsv_dataset) - - test_example_data = [["When", "do", "you", "use", "シ", - "instead", "of", "し?"], - ["What", "is", "2+2", "", "", - "", "", ""], - ["Here", "is", "a", "sentence", "with", - "some", "oovs", ""]] - - # Test with batch_first - include_lengths_numericalized = question_field.numericalize( - test_example_data) - verify_numericalized_example(question_field, - test_example_data, - include_lengths_numericalized, - batch_first=True) - - 
def test_numericalize_postprocessing(self): - self.write_test_ppid_dataset(data_format="tsv") - - def reverse_postprocess(arr, vocab): - return [list(reversed(sentence)) for sentence in arr] - - question_field = data.Field(sequential=True, - postprocessing=reverse_postprocess) - tsv_fields = [("id", None), ("q1", question_field), - ("q2", question_field), ("label", None)] - - tsv_dataset = data.TabularDataset( - path=self.test_ppid_dataset_path, format="tsv", - fields=tsv_fields) - question_field.build_vocab(tsv_dataset) - - test_example_data = [["When", "do", "you", "use", "シ", - "instead", "of", "し?"], - ["What", "is", "2+2", "", "", - "", "", ""], - ["Here", "is", "a", "sentence", "with", - "some", "oovs", ""]] - reversed_test_example_data = [list(reversed(sentence)) for sentence in - test_example_data] - - postprocessed_numericalized = question_field.numericalize( - (test_example_data)) - verify_numericalized_example(question_field, - reversed_test_example_data, - postprocessed_numericalized) - - def test_numericalize_stop_words(self): - # Based on request from #354 - self.write_test_ppid_dataset(data_format="tsv") - question_field = data.Field(sequential=True, batch_first=True, - stop_words=set(["do", "you"])) - tsv_fields = [("id", None), ("q1", question_field), - ("q2", question_field), ("label", None)] - tsv_dataset = data.TabularDataset( - path=self.test_ppid_dataset_path, format="tsv", - fields=tsv_fields) - question_field.build_vocab(tsv_dataset) - - test_example_data = question_field.pad( - [question_field.preprocess(x) for x in - [["When", "do", "you", "use", "シ", - "instead", "of", "し?"], - ["What", "is", "2+2", "", "", - "", "", ""], - ["Here", "is", "a", "sentence", "with", - "some", "oovs", ""]]] - ) - - # Test with batch_first - stopwords_removed_numericalized = question_field.numericalize(test_example_data) - verify_numericalized_example(question_field, - test_example_data, - stopwords_removed_numericalized, - batch_first=True) - - def test_numerical_features_no_vocab(self): - self.write_test_numerical_features_dataset() - # Test basic usage - int_field = data.Field(sequential=False, use_vocab=False) - float_field = data.Field(sequential=False, use_vocab=False, - dtype=torch.float) - tsv_fields = [("int", int_field), ("float", float_field), ("string", None)] - tsv_dataset = data.TabularDataset( - path=self.test_numerical_features_dataset_path, format="tsv", - fields=tsv_fields) - int_field.build_vocab(tsv_dataset) - float_field.build_vocab(tsv_dataset) - test_int_data = ["1", "0", "1", "3", "19"] - test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"] - - numericalized_int = int_field.numericalize(test_int_data) - self.assertEqual(numericalized_int.data, [1, 0, 1, 3, 19]) - numericalized_float = float_field.numericalize(test_float_data) - self.assertEqual(numericalized_float.data, [1.1, 0.1, 3.91, 0.2, 10.2]) - - # Test with postprocessing applied - int_field = data.Field(sequential=False, use_vocab=False, - postprocessing=lambda arr, _: [x + 1 for x in arr]) - float_field = data.Field(sequential=False, use_vocab=False, - dtype=torch.float, - postprocessing=lambda arr, _: [x * 0.5 for x in arr]) - tsv_fields = [("int", int_field), ("float", float_field), ("string", None)] - tsv_dataset = data.TabularDataset( - path=self.test_numerical_features_dataset_path, format="tsv", - fields=tsv_fields) - int_field.build_vocab(tsv_dataset) - float_field.build_vocab(tsv_dataset) - test_int_data = ["1", "0", "1", "3", "19"] - test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"] - 
- numericalized_int = int_field.numericalize(test_int_data) - self.assertEqual(numericalized_int.data, [2, 1, 2, 4, 20]) - numericalized_float = float_field.numericalize(test_float_data) - self.assertEqual(numericalized_float.data, [0.55, 0.05, 1.955, 0.1, 5.1]) - - def test_errors(self): - # Test that passing a non-tuple (of data and length) to numericalize - # with Field.include_lengths = True raises an error. - with self.assertRaises(ValueError): - self.write_test_ppid_dataset(data_format="tsv") - question_field = data.Field(sequential=True, include_lengths=True) - tsv_fields = [("id", None), ("q1", question_field), - ("q2", question_field), ("label", None)] - tsv_dataset = data.TabularDataset( - path=self.test_ppid_dataset_path, format="tsv", - fields=tsv_fields) - question_field.build_vocab(tsv_dataset) - test_example_data = [["When", "do", "you", "use", "シ", - "instead", "of", "し?"], - ["What", "is", "2+2", "", "", - "", "", ""], - ["Here", "is", "a", "sentence", "with", - "some", "oovs", ""]] - question_field.numericalize( - test_example_data) - - def test_serialization_pre_build(self): - self.write_test_ppid_dataset(data_format="tsv") - question_field = data.Field(sequential=True) - - question_pickle_filename = "question.pl" - question_pickle_path = os.path.join(self.test_dir, question_pickle_filename) - torch.save(question_field, question_pickle_path) - - loaded_question_field = torch.load(question_pickle_path) - - assert loaded_question_field == question_field - - def test_serialization_built_vocab(self): - self.write_test_ppid_dataset(data_format="tsv") - question_field = data.Field(sequential=True) - tsv_fields = [("id", None), ("q1", question_field), - ("q2", question_field), ("label", None)] - tsv_dataset = data.TabularDataset( - path=self.test_ppid_dataset_path, format="tsv", - fields=tsv_fields) - - question_field.build_vocab(tsv_dataset) - - question_pickle_filename = "question.pl" - question_pickle_path = os.path.join(self.test_dir, question_pickle_filename) - torch.save(question_field, question_pickle_path) - - loaded_question_field = torch.load(question_pickle_path) - - assert loaded_question_field == question_field - - test_example_data = [["When", "do", "you", "use", "シ", - "instead", "of", "し?"], - ["What", "is", "2+2", "", "", - "", "", ""], - ["Here", "is", "a", "sentence", "with", - "some", "oovs", ""]] - - # Test results of numericalization - original_numericalization = question_field.numericalize(test_example_data) - pickled_numericalization = loaded_question_field.numericalize(test_example_data) - - assert torch.all(torch.eq(original_numericalization, pickled_numericalization)) - - -class TestNestedField(TorchtextTestCase): - def test_init_minimal(self): - nesting_field = data.Field() - field = data.NestedField(nesting_field) - - assert isinstance(field, data.Field) - assert field.nesting_field is nesting_field - assert field.sequential - assert field.use_vocab - assert field.init_token is None - assert field.eos_token is None - assert field.unk_token == nesting_field.unk_token - assert field.fix_length is None - assert field.dtype is torch.long - assert field.preprocessing is None - assert field.postprocessing is None - assert field.lower == nesting_field.lower - assert field.tokenize("a b c") == "a b c".split() - assert not field.include_lengths - assert field.batch_first - assert field.pad_token == nesting_field.pad_token - assert not field.pad_first - - def test_init_when_nesting_field_is_not_sequential(self): - nesting_field = data.Field(sequential=False) 
- field = data.NestedField(nesting_field) - - assert field.pad_token == "" - - def test_init_when_nesting_field_has_include_lengths_equal_true(self): - nesting_field = data.Field(include_lengths=True) - - with pytest.raises(ValueError) as excinfo: - data.NestedField(nesting_field) - assert "nesting field cannot have include_lengths=True" in str(excinfo.value) - - def test_init_with_nested_field_as_nesting_field(self): - nesting_field = data.NestedField(data.Field()) - - with pytest.raises(ValueError) as excinfo: - data.NestedField(nesting_field) - assert "nesting field must not be another NestedField" in str(excinfo.value) - - def test_init_full(self): - nesting_field = data.Field() - field = data.NestedField( - nesting_field, - use_vocab=False, - init_token="", - eos_token="", - fix_length=10, - dtype=torch.float, - preprocessing=lambda xs: list(reversed(xs)), - postprocessing=lambda xs: [x.upper() for x in xs], - tokenize=list, - pad_first=True, - ) - - assert not field.use_vocab - assert field.init_token == "" - assert field.eos_token == "" - assert field.fix_length == 10 - assert field.dtype is torch.float - assert field.preprocessing("a b c".split()) == "c b a".split() - assert field.postprocessing("a b c".split()) == "A B C".split() - assert field.tokenize("abc") == ["a", "b", "c"] - assert field.pad_first - - def test_preprocess(self): - nesting_field = data.Field( - tokenize=list, preprocessing=lambda xs: [x.upper() for x in xs]) - field = data.NestedField(nesting_field, preprocessing=lambda xs: reversed(xs)) - preprocessed = field.preprocess("john loves mary") - - assert preprocessed == [list("MARY"), list("LOVES"), list("JOHN")] - - def test_build_vocab_from_dataset(self): - nesting_field = data.Field(tokenize=list, unk_token="", pad_token="", - init_token="", eos_token="") - CHARS = data.NestedField(nesting_field, init_token="", eos_token="") - ex1 = data.Example.fromlist(["aaa bbb c"], [("chars", CHARS)]) - ex2 = data.Example.fromlist(["bbb aaa"], [("chars", CHARS)]) - dataset = data.Dataset([ex1, ex2], [("chars", CHARS)]) - - CHARS.build_vocab(dataset, min_freq=2) - - expected = "a b ".split() - assert len(CHARS.vocab) == len(expected) - for c in expected: - assert c in CHARS.vocab.stoi - - expected_freqs = Counter({"a": 6, "b": 6, "c": 1}) - assert CHARS.vocab.freqs == CHARS.nesting_field.vocab.freqs == expected_freqs - - def test_build_vocab_from_iterable(self): - nesting_field = data.Field(unk_token="", pad_token="") - CHARS = data.NestedField(nesting_field) - CHARS.build_vocab( - [[list("aaa"), list("bbb"), ["c"]], [list("bbb"), list("aaa")]], - [[list("ccc"), list("bbb")], [list("bbb")]], - ) - - expected = "a b c ".split() - assert len(CHARS.vocab) == len(expected) - for c in expected: - assert c in CHARS.vocab.stoi - - expected_freqs = Counter({"a": 6, "b": 12, "c": 4}) - assert CHARS.vocab.freqs == CHARS.nesting_field.vocab.freqs == expected_freqs - - def test_pad(self): - nesting_field = data.Field(tokenize=list, unk_token="", pad_token="", - init_token="", eos_token="") - CHARS = data.NestedField(nesting_field, init_token="", eos_token="") - minibatch = [ - [list("john"), list("loves"), list("mary")], - [list("mary"), list("cries")], - ] - expected = [ - [ - ["", "", ""] + [""] * 4, - [""] + list("john") + ["", ""], - [""] + list("loves") + [""], - [""] + list("mary") + ["", ""], - ["", "", ""] + [""] * 4, - ], - [ - ["", "", ""] + [""] * 4, - [""] + list("mary") + ["", ""], - [""] + list("cries") + [""], - ["", "", ""] + [""] * 4, - [""] * 7, - ] - ] - - assert 
CHARS.pad(minibatch) == expected - - # test include_length - nesting_field = data.Field(tokenize=list, unk_token="", pad_token="", - init_token="", eos_token="") - CHARS = data.NestedField(nesting_field, init_token="", - eos_token="", include_lengths=True) - arr, seq_len, words_len = CHARS.pad(minibatch) - assert arr == expected - assert seq_len == [5, 4] - assert words_len == [[3, 6, 7, 6, 3], [3, 6, 7, 3, 0]] - - def test_pad_when_nesting_field_is_not_sequential(self): - nesting_field = data.Field(sequential=False, unk_token="", - pad_token="", init_token="", eos_token="") - CHARS = data.NestedField(nesting_field, init_token="", eos_token="") - minibatch = [ - ["john", "loves", "mary"], - ["mary", "cries"] - ] - expected = [ - ["", "john", "loves", "mary", ""], - ["", "mary", "cries", "", ""], - ] - - assert CHARS.pad(minibatch) == expected - - def test_pad_when_nesting_field_has_fix_length(self): - nesting_field = data.Field(tokenize=list, unk_token="", pad_token="", - init_token="", eos_token="", fix_length=5) - CHARS = data.NestedField(nesting_field, init_token="", eos_token="") - minibatch = [ - ["john", "loves", "mary"], - ["mary", "cries"] - ] - expected = [ - [ - ["", "", ""] + [""] * 2, - [""] + list("joh") + [""], - [""] + list("lov") + [""], - [""] + list("mar") + [""], - ["", "", ""] + [""] * 2, - ], - [ - ["", "", ""] + [""] * 2, - [""] + list("mar") + [""], - [""] + list("cri") + [""], - ["", "", ""] + [""] * 2, - [""] * 5, - ] - ] - - assert CHARS.pad(minibatch) == expected - - # test include length - nesting_field = data.Field(tokenize=list, unk_token="", pad_token="", - init_token="", eos_token="", fix_length=5) - CHARS = data.NestedField(nesting_field, init_token="", - eos_token="", include_lengths=True) - arr, seq_len, words_len = CHARS.pad(minibatch) - assert arr == expected - assert seq_len == [5, 4] - assert words_len == [[3, 5, 5, 5, 3], [3, 5, 5, 3, 0]] - - def test_pad_when_fix_length_is_not_none(self): - nesting_field = data.Field(tokenize=list, unk_token="", pad_token="", - init_token="", eos_token="") - CHARS = data.NestedField( - nesting_field, init_token="", eos_token="", fix_length=3) - minibatch = [ - ["john", "loves", "mary"], - ["mary", "cries"] - ] - expected = [ - [ - ["", "", ""] + [""] * 4, - [""] + list("john") + ["", ""], - ["", "", ""] + [""] * 4, - ], - [ - ["", "", ""] + [""] * 4, - [""] + list("mary") + ["", ""], - ["", "", ""] + [""] * 4, - ] - ] - - assert CHARS.pad(minibatch) == expected - - # test include length - nesting_field = data.Field(tokenize=list, unk_token="", pad_token="", - init_token="", eos_token="") - CHARS = data.NestedField(nesting_field, init_token="", - eos_token="", include_lengths=True, fix_length=3) - arr, seq_len, words_len = CHARS.pad(minibatch) - assert arr == expected - assert seq_len == [3, 3] - assert words_len == [[3, 6, 3], [3, 6, 3]] - - def test_pad_when_no_init_and_eos_tokens(self): - nesting_field = data.Field(tokenize=list, unk_token="", pad_token="", - init_token="", eos_token="") - CHARS = data.NestedField(nesting_field) - minibatch = [ - ["john", "loves", "mary"], - ["mary", "cries"] - ] - expected = [ - [ - [""] + list("john") + ["", ""], - [""] + list("loves") + [""], - [""] + list("mary") + ["", ""], - ], - [ - [""] + list("mary") + ["", ""], - [""] + list("cries") + [""], - [""] * 7, - ] - ] - - assert CHARS.pad(minibatch) == expected - - def test_pad_when_pad_first_is_true(self): - nesting_field = data.Field(tokenize=list, unk_token="", pad_token="", - init_token="", eos_token="") - CHARS = 
data.NestedField(nesting_field, init_token="", eos_token="", - pad_first=True) - minibatch = [ - [list("john"), list("loves"), list("mary")], - [list("mary"), list("cries")], - ] - expected = [ - [ - ["", "", ""] + [""] * 4, - [""] + list("john") + ["", ""], - [""] + list("loves") + [""], - [""] + list("mary") + ["", ""], - ["", "", ""] + [""] * 4, - ], - [ - [""] * 7, - ["", "", ""] + [""] * 4, - [""] + list("mary") + ["", ""], - [""] + list("cries") + [""], - ["", "", ""] + [""] * 4, - ] - ] - - assert CHARS.pad(minibatch) == expected - - # test include_length - nesting_field = data.Field(tokenize=list, unk_token="", pad_token="", - init_token="", eos_token="") - CHARS = data.NestedField(nesting_field, init_token="", - eos_token="", include_lengths=True, - pad_first=True) - arr, seq_len, words_len = CHARS.pad(minibatch) - assert arr == expected - assert seq_len == [5, 4] - assert words_len == [[3, 6, 7, 6, 3], [0, 3, 6, 7, 3]] - - def test_numericalize(self): - nesting_field = data.Field(batch_first=True) - field = data.NestedField(nesting_field) - ex1 = data.Example.fromlist(["john loves mary"], [("words", field)]) - ex2 = data.Example.fromlist(["mary cries"], [("words", field)]) - dataset = data.Dataset([ex1, ex2], [("words", field)]) - field.build_vocab(dataset) - examples_data = [ - [ - ["", "", ""] + [""] * 4, - [""] + list("john") + ["", ""], - [""] + list("loves") + [""], - [""] + list("mary") + ["", ""], - ["", "", ""] + [""] * 4, - ], - [ - ["", "", ""] + [""] * 4, - [""] + list("mary") + ["", ""], - [""] + list("cries") + [""], - ["", "", ""] + [""] * 4, - [""] * 7, - ] - ] - numericalized = field.numericalize(examples_data) - - assert numericalized.dim() == 3 - assert numericalized.size(0) == len(examples_data) - for example, numericalized_example in zip(examples_data, numericalized): - verify_numericalized_example( - field, example, numericalized_example, batch_first=True) - - # test include_lengths - nesting_field = data.Field(batch_first=True) - field = data.NestedField(nesting_field, include_lengths=True) - ex1 = data.Example.fromlist(["john loves mary"], [("words", field)]) - ex2 = data.Example.fromlist(["mary cries"], [("words", field)]) - dataset = data.Dataset([ex1, ex2], [("words", field)]) - field.build_vocab(dataset) - examples_data = [ - [ - ["", "", ""] + [""] * 4, - [""] + list("john") + ["", ""], - [""] + list("loves") + [""], - [""] + list("mary") + ["", ""], - ["", "", ""] + [""] * 4, - ], - [ - ["", "", ""] + [""] * 4, - [""] + list("mary") + ["", ""], - [""] + list("cries") + [""], - ["", "", ""] + [""] * 4, - [""] * 7, - ] - ] - - numericalized, seq_len, word_len = field.numericalize( - (examples_data, [5, 4], [[3, 6, 7, 6, 3], [3, 6, 7, 3, 0]])) - - assert numericalized.dim() == 3 - assert len(seq_len) == 2 - assert len(word_len) == 2 - - assert numericalized.size(0) == len(examples_data) - for example, numericalized_example in zip(examples_data, numericalized): - verify_numericalized_example( - field, example, numericalized_example, batch_first=True) - - def test_serialization(self): - nesting_field = data.Field(batch_first=True) - field = data.NestedField(nesting_field) - ex1 = data.Example.fromlist(["john loves mary"], [("words", field)]) - ex2 = data.Example.fromlist(["mary cries"], [("words", field)]) - dataset = data.Dataset([ex1, ex2], [("words", field)]) - field.build_vocab(dataset) - examples_data = [ - [ - ["", "", ""] + [""] * 4, - [""] + list("john") + ["", ""], - [""] + list("loves") + [""], - [""] + list("mary") + ["", ""], - ["", "", ""] + 
[""] * 4, - ], - [ - ["", "", ""] + [""] * 4, - [""] + list("mary") + ["", ""], - [""] + list("cries") + [""], - ["", "", ""] + [""] * 4, - [""] * 7, - ] - ] - - field_pickle_filename = "char_field.pl" - field_pickle_path = os.path.join(self.test_dir, field_pickle_filename) - torch.save(field, field_pickle_path) - - loaded_field = torch.load(field_pickle_path) - assert loaded_field == field - - original_numericalization = field.numericalize(examples_data) - pickled_numericalization = loaded_field.numericalize(examples_data) - - assert torch.all(torch.eq(original_numericalization, pickled_numericalization)) - - -class TestLabelField(TorchtextTestCase): - def test_init(self): - # basic init - label_field = data.LabelField() - assert label_field.sequential is False - assert label_field.unk_token is None - - # init with preset fields - label_field = data.LabelField(sequential=True, unk_token="") - assert label_field.sequential is False - assert label_field.unk_token is None - - def test_vocab_size(self): - # Set up fields - question_field = data.Field(sequential=True) - label_field = data.LabelField() - - # Copied from test_build_vocab with minor changes - # Write TSV dataset and construct a Dataset - self.write_test_ppid_dataset(data_format="tsv") - tsv_fields = [("id", None), ("q1", question_field), - ("q2", question_field), ("label", label_field)] - tsv_dataset = data.TabularDataset( - path=self.test_ppid_dataset_path, format="tsv", - fields=tsv_fields) - - # Skipping json dataset as we can rely on the original build vocab test - label_field.build_vocab(tsv_dataset) - assert label_field.vocab.freqs == Counter({'1': 2, '0': 1}) - expected_stoi = {'1': 0, '0': 1} # No - assert dict(label_field.vocab.stoi) == expected_stoi - # Turn the stoi dictionary into an itos list - expected_itos = [x[0] for x in sorted(expected_stoi.items(), - key=lambda tup: tup[1])] - assert label_field.vocab.itos == expected_itos diff --git a/test/legacy/data/test_pipeline.py b/test/legacy/data/test_pipeline.py deleted file mode 100644 index 7a39c081fa..0000000000 --- a/test/legacy/data/test_pipeline.py +++ /dev/null @@ -1,52 +0,0 @@ -# -*- coding: utf-8 -*- -import torchtext.legacy.data as data - -from ...common.torchtext_test_case import TorchtextTestCase - - -class TestPipeline(TorchtextTestCase): - @staticmethod - def repeat_n(x, n=3): - """ - Given a sequence, repeat it n times. 
- """ - return x * n - - def test_pipeline(self): - id_pipeline = data.Pipeline() - assert id_pipeline("Test STring") == "Test STring" - assert id_pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T" - assert id_pipeline(["1241", "Some String"]) == ["1241", "Some String"] - - pipeline = data.Pipeline(str.lower) - assert pipeline("Test STring") == "test string" - assert pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t" - assert pipeline(["1241", "Some String"]) == ["1241", "some string"] - - args_pipeline = data.Pipeline(TestPipeline.repeat_n) - assert args_pipeline("test", 5) == "testtesttesttesttest" - assert args_pipeline(["ele1", "ele2"], 2) == ["ele1ele1", "ele2ele2"] - - def test_composition(self): - id_pipeline = data.Pipeline() - pipeline = data.Pipeline(TestPipeline.repeat_n) - pipeline.add_before(id_pipeline) - pipeline.add_after(id_pipeline) - pipeline.add_before(str.lower) - pipeline.add_after(str.capitalize) - - other_pipeline = data.Pipeline(str.swapcase) - other_pipeline.add_before(pipeline) - - # Assert pipeline gives proper results after composition - # (test that we aren't modfifying pipes member) - assert pipeline("teST") == "Testtesttest" - assert pipeline(["ElE1", "eLe2"]) == ["Ele1ele1ele1", "Ele2ele2ele2"] - - # Assert pipeline that we added to gives proper results - assert other_pipeline("teST") == "tESTTESTTEST" - assert other_pipeline(["ElE1", "eLe2"]) == ["eLE1ELE1ELE1", "eLE2ELE2ELE2"] - - def test_exceptions(self): - with self.assertRaises(ValueError): - data.Pipeline("Not Callable") diff --git a/test/legacy/data/test_subword.py b/test/legacy/data/test_subword.py deleted file mode 100644 index 83aec7df0f..0000000000 --- a/test/legacy/data/test_subword.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python3 -# Note that all the tests in this module require dataset (either network access or cached) -import unittest - -from torchtext.legacy import data -from torchtext.legacy.datasets import TREC - - -class TestSubword(unittest.TestCase): - def test_subword_trec(self): - TEXT = data.SubwordField() - LABEL = data.Field(sequential=False) - RAW = data.Field(sequential=False, use_vocab=False) - raw, _ = TREC.splits(RAW, LABEL) - cooked, _ = TREC.splits(TEXT, LABEL) - LABEL.build_vocab(cooked) - TEXT.build_vocab(cooked, max_size=100) - TEXT.segment(cooked) - print(cooked[0].text) - batch = next(iter(data.Iterator(cooked, 1, shuffle=False))) - self.assertEqual(TEXT.reverse(batch.text.data)[0], raw[0].text) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy/imdb.py b/test/legacy/imdb.py deleted file mode 100644 index eec869cc83..0000000000 --- a/test/legacy/imdb.py +++ /dev/null @@ -1,43 +0,0 @@ -from torchtext.legacy import data -from torchtext.legacy import datasets -from torchtext.vocab import GloVe - - -# Approach 1: -# set up fields -TEXT = data.Field(lower=True, include_lengths=True, batch_first=True) -LABEL = data.Field(sequential=False) - - -# make splits for data -train, test = datasets.IMDB.splits(TEXT, LABEL) - -# print information about the data -print('train.fields', train.fields) -print('len(train)', len(train)) -print('vars(train[0])', vars(train[0])) - -# build the vocabulary -TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300)) -LABEL.build_vocab(train) - -# print vocab information -print('len(TEXT.vocab)', len(TEXT.vocab)) -print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size()) - -# make iterator for splits -train_iter, test_iter = data.BucketIterator.splits( - (train, test), batch_size=3, device="cuda:0") - -# print batch 
information -batch = next(iter(train_iter)) -print(batch.text) -print(batch.label) - -# Approach 2: -train_iter, test_iter = datasets.IMDB.iters(batch_size=4) - -# print batch information -batch = next(iter(train_iter)) -print(batch.text) -print(batch.label) diff --git a/test/legacy/language_modeling.py b/test/legacy/language_modeling.py deleted file mode 100644 index fc41b57c5a..0000000000 --- a/test/legacy/language_modeling.py +++ /dev/null @@ -1,38 +0,0 @@ -from torchtext.legacy import data -from torchtext.legacy import datasets -from torchtext.vocab import GloVe - -# Approach 1: -# set up fields -TEXT = data.Field(lower=True, batch_first=True) - -# make splits for data -train, valid, test = datasets.WikiText2.splits(TEXT) - -# print information about the data -print('train.fields', train.fields) -print('len(train)', len(train)) -print('vars(train[0])', vars(train[0])['text'][0:10]) - -# build the vocabulary -TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300)) - -# print vocab information -print('len(TEXT.vocab)', len(TEXT.vocab)) - -# make iterator for splits -train_iter, valid_iter, test_iter = data.BPTTIterator.splits( - (train, valid, test), batch_size=3, bptt_len=30, device="cuda:0") - -# print batch information -batch = next(iter(train_iter)) -print(batch.text) -print(batch.target) - -# Approach 2: -train_iter, valid_iter, test_iter = datasets.WikiText2.iters(batch_size=4, bptt_len=30) - -# print batch information -batch = next(iter(train_iter)) -print(batch.text) -print(batch.target) diff --git a/test/legacy/nli.py b/test/legacy/nli.py deleted file mode 100644 index 860064f929..0000000000 --- a/test/legacy/nli.py +++ /dev/null @@ -1,304 +0,0 @@ -import torch -from ..common.torchtext_test_case import TorchtextTestCase - -from torchtext.legacy.datasets import SNLI, MultiNLI, XNLI -from torchtext.legacy.datasets.nli import ParsedTextField, ShiftReduceField -from torchtext.legacy.data import Field, LabelField, Iterator - -import shutil - - -class TestNLI(TorchtextTestCase): - - def test_snli(self): - batch_size = 4 - - # create fields - TEXT = ParsedTextField() - TREE = ShiftReduceField() - LABEL = LabelField() - - # create train/val/test splits - train, val, test = SNLI.splits(TEXT, LABEL, TREE) - - # check all are SNLI datasets - assert type(train) == type(val) == type(test) == SNLI - - # check all have correct number of fields - assert len(train.fields) == len(val.fields) == len(test.fields) == 5 - - # check fields are the correct type - assert type(train.fields['premise']) == ParsedTextField - assert type(train.fields['premise_transitions']) == ShiftReduceField - assert type(train.fields['hypothesis']) == ParsedTextField - assert type(train.fields['hypothesis_transitions']) == ShiftReduceField - assert type(train.fields['label']) == LabelField - - assert type(val.fields['premise']) == ParsedTextField - assert type(val.fields['premise_transitions']) == ShiftReduceField - assert type(val.fields['hypothesis']) == ParsedTextField - assert type(val.fields['hypothesis_transitions']) == ShiftReduceField - assert type(val.fields['label']) == LabelField - - assert type(test.fields['premise']) == ParsedTextField - assert type(test.fields['premise_transitions']) == ShiftReduceField - assert type(test.fields['hypothesis']) == ParsedTextField - assert type(test.fields['hypothesis_transitions']) == ShiftReduceField - assert type(test.fields['label']) == LabelField - - # check each is the correct length - assert len(train) == 549367 - assert len(val) == 9842 - assert len(test) == 9824 - 
- # build vocabulary - TEXT.build_vocab(train) - LABEL.build_vocab(train) - - # ensure vocabulary has been created - assert hasattr(TEXT, 'vocab') - assert hasattr(TEXT.vocab, 'itos') - assert hasattr(TEXT.vocab, 'stoi') - - # create iterators - train_iter, val_iter, test_iter = Iterator.splits((train, val, test), - batch_size=batch_size) - - # get a batch to test - batch = next(iter(train_iter)) - - # split premise and hypothesis from tuples to tensors - premise, premise_transitions = batch.premise - hypothesis, hypothesis_transitions = batch.hypothesis - label = batch.label - - # check each is actually a tensor - assert type(premise) == torch.Tensor - assert type(premise_transitions) == torch.Tensor - assert type(hypothesis) == torch.Tensor - assert type(hypothesis_transitions) == torch.Tensor - assert type(label) == torch.Tensor - - # check have the correct batch dimension - assert premise.shape[-1] == batch_size - assert premise_transitions.shape[-1] == batch_size - assert hypothesis.shape[-1] == batch_size - assert hypothesis_transitions.shape[-1] == batch_size - assert label.shape[-1] == batch_size - - # repeat the same tests with iters instead of split - train_iter, val_iter, test_iter = SNLI.iters(batch_size=batch_size, - trees=True) - - # split premise and hypothesis from tuples to tensors - premise, premise_transitions = batch.premise - hypothesis, hypothesis_transitions = batch.hypothesis - label = batch.label - - # check each is actually a tensor - assert type(premise) == torch.Tensor - assert type(premise_transitions) == torch.Tensor - assert type(hypothesis) == torch.Tensor - assert type(hypothesis_transitions) == torch.Tensor - assert type(label) == torch.Tensor - - # check have the correct batch dimension - assert premise.shape[-1] == batch_size - assert premise_transitions.shape[-1] == batch_size - assert hypothesis.shape[-1] == batch_size - assert hypothesis_transitions.shape[-1] == batch_size - assert label.shape[-1] == batch_size - - # remove downloaded snli directory - shutil.rmtree('.data/snli') - - def test_multinli(self): - batch_size = 4 - - # create fields - TEXT = ParsedTextField() - TREE = ShiftReduceField() - GENRE = LabelField() - LABEL = LabelField() - - # create train/val/test splits - train, val, test = MultiNLI.splits(TEXT, LABEL, TREE, GENRE) - - # check all are MultiNLI datasets - assert type(train) == type(val) == type(test) == MultiNLI - - # check all have correct number of fields - assert len(train.fields) == len(val.fields) == len(test.fields) == 6 - - # check fields are the correct type - assert type(train.fields['premise']) == ParsedTextField - assert type(train.fields['premise_transitions']) == ShiftReduceField - assert type(train.fields['hypothesis']) == ParsedTextField - assert type(train.fields['hypothesis_transitions']) == ShiftReduceField - assert type(train.fields['label']) == LabelField - assert type(train.fields['genre']) == LabelField - - assert type(val.fields['premise']) == ParsedTextField - assert type(val.fields['premise_transitions']) == ShiftReduceField - assert type(val.fields['hypothesis']) == ParsedTextField - assert type(val.fields['hypothesis_transitions']) == ShiftReduceField - assert type(val.fields['label']) == LabelField - assert type(val.fields['genre']) == LabelField - - assert type(test.fields['premise']) == ParsedTextField - assert type(test.fields['premise_transitions']) == ShiftReduceField - assert type(test.fields['hypothesis']) == ParsedTextField - assert type(test.fields['hypothesis_transitions']) == 
ShiftReduceField - assert type(test.fields['label']) == LabelField - assert type(test.fields['genre']) == LabelField - - # check each is the correct length - assert len(train) == 392702 - assert len(val) == 9815 - assert len(test) == 9832 - - # build vocabulary - TEXT.build_vocab(train) - LABEL.build_vocab(train) - GENRE.build_vocab(train) - - # ensure vocabulary has been created - assert hasattr(TEXT, 'vocab') - assert hasattr(TEXT.vocab, 'itos') - assert hasattr(TEXT.vocab, 'stoi') - - # create iterators - train_iter, val_iter, test_iter = Iterator.splits((train, val, test), - batch_size=batch_size) - - # get a batch to test - batch = next(iter(train_iter)) - - # split premise and hypothesis from tuples to tensors - premise, premise_transitions = batch.premise - hypothesis, hypothesis_transitions = batch.hypothesis - label = batch.label - genre = batch.genre - - # check each is actually a tensor - assert type(premise) == torch.Tensor - assert type(premise_transitions) == torch.Tensor - assert type(hypothesis) == torch.Tensor - assert type(hypothesis_transitions) == torch.Tensor - assert type(label) == torch.Tensor - assert type(genre) == torch.Tensor - - # check have the correct batch dimension - assert premise.shape[-1] == batch_size - assert premise_transitions.shape[-1] == batch_size - assert hypothesis.shape[-1] == batch_size - assert hypothesis_transitions.shape[-1] == batch_size - assert label.shape[-1] == batch_size - assert genre.shape[-1] == batch_size - - # repeat the same tests with iters instead of split - train_iter, val_iter, test_iter = MultiNLI.iters(batch_size=batch_size, - trees=True) - - # split premise and hypothesis from tuples to tensors - premise, premise_transitions = batch.premise - hypothesis, hypothesis_transitions = batch.hypothesis - label = batch.label - - # check each is actually a tensor - assert type(premise) == torch.Tensor - assert type(premise_transitions) == torch.Tensor - assert type(hypothesis) == torch.Tensor - assert type(hypothesis_transitions) == torch.Tensor - assert type(label) == torch.Tensor - - # check have the correct batch dimension - assert premise.shape[-1] == batch_size - assert premise_transitions.shape[-1] == batch_size - assert hypothesis.shape[-1] == batch_size - assert hypothesis_transitions.shape[-1] == batch_size - assert label.shape[-1] == batch_size - - # remove downloaded multinli directory - shutil.rmtree('.data/multinli') - - def test_xnli(self): - batch_size = 4 - - # create fields - TEXT = Field() - GENRE = LabelField() - LABEL = LabelField() - LANGUAGE = LabelField() - - # create val/test splits, XNLI does not have a test set - val, test = XNLI.splits(TEXT, LABEL, GENRE, LANGUAGE) - - # check both are XNLI datasets - assert type(val) == type(test) == XNLI - - # check all have the correct number of fields - assert len(val.fields) == len(test.fields) == 5 - - # check fields are the correct type - assert type(val.fields['premise']) == Field - assert type(val.fields['hypothesis']) == Field - assert type(val.fields['label']) == LabelField - assert type(val.fields['genre']) == LabelField - assert type(val.fields['language']) == LabelField - - assert type(test.fields['premise']) == Field - assert type(test.fields['hypothesis']) == Field - assert type(test.fields['label']) == LabelField - assert type(test.fields['genre']) == LabelField - assert type(test.fields['language']) == LabelField - - # check each is the correct length - assert len(val) == 37350 - assert len(test) == 75150 - - # build vocabulary - TEXT.build_vocab(val) - 
LABEL.build_vocab(val) - GENRE.build_vocab(val) - LANGUAGE.build_vocab(val) - - # ensure vocabulary has been created - assert hasattr(TEXT, 'vocab') - assert hasattr(TEXT.vocab, 'itos') - assert hasattr(TEXT.vocab, 'stoi') - - # create iterators - val_iter, test_iter = Iterator.splits((val, test), - batch_size=batch_size) - - # get a batch to test - batch = next(iter(val_iter)) - - # split premise and hypothesis from tuples to tensors - premise = batch.premise - hypothesis = batch.hypothesis - label = batch.label - genre = batch.genre - language = batch.language - - # check each is actually a tensor - assert type(premise) == torch.Tensor - assert type(hypothesis) == torch.Tensor - assert type(label) == torch.Tensor - assert type(genre) == torch.Tensor - assert type(language) == torch.Tensor - - # check have the correct batch dimension - assert premise.shape[-1] == batch_size - assert hypothesis.shape[-1] == batch_size - assert label.shape[-1] == batch_size - assert genre.shape[-1] == batch_size - assert language.shape[-1] == batch_size - - # xnli cannot use the iters method, ensure raises error - with self.assertRaises(NotImplementedError): - val_iter, test_iter = XNLI.iters(batch_size=batch_size) - - # remove downloaded xnli directory - shutil.rmtree('.data/xnli') diff --git a/test/legacy/sequence_tagging.py b/test/legacy/sequence_tagging.py deleted file mode 100644 index 76f65448e8..0000000000 --- a/test/legacy/sequence_tagging.py +++ /dev/null @@ -1,86 +0,0 @@ -from torchtext.legacy import data -from torchtext.legacy import datasets -from torchtext.vocab import GloVe - -# Define the fields associated with the sequences. -WORD = data.Field(init_token="", eos_token="") -UD_TAG = data.Field(init_token="", eos_token="") - -# Download and the load default data. -train, val, test = datasets.UDPOS.splits( - fields=(('word', WORD), ('udtag', UD_TAG), (None, None))) - -print(train.fields) -print(len(train)) -print(vars(train[0])) - -# We can also define more than two columns. -WORD = data.Field(init_token="", eos_token="") -UD_TAG = data.Field(init_token="", eos_token="") -PTB_TAG = data.Field(init_token="", eos_token="") - -# Load the specified data. 
-train, val, test = datasets.UDPOS.splits( - fields=(('word', WORD), ('udtag', UD_TAG), ('ptbtag', PTB_TAG)), - path=".data/sequence-labeling/en-ud-v2", - train="en-ud-tag.v2.train.txt", - validation="en-ud-tag.v2.dev.txt", - test="en-ud-tag.v2.test.txt") - -print(train.fields) -print(len(train)) -print(vars(train[0])) - -WORD.build_vocab(train.word, min_freq=3) -UD_TAG.build_vocab(train.udtag) -PTB_TAG.build_vocab(train.ptbtag) - -print(UD_TAG.vocab.freqs) -print(PTB_TAG.vocab.freqs) - -train_iter, val_iter = data.BucketIterator.splits( - (train, val), batch_size=3, device="cuda:0") - -batch = next(iter(train_iter)) - -print("words", batch.word) -print("udtags", batch.udtag) -print("ptbtags", batch.ptbtag) - -# Now lets try both word and character embeddings -WORD = data.Field(init_token="", eos_token="") -PTB_TAG = data.Field(init_token="", eos_token="") - -# We'll use NestedField to tokenize each word into list of chars -CHAR_NESTING = data.Field(tokenize=list, init_token="", eos_token="") -CHAR = data.NestedField(CHAR_NESTING, init_token="", eos_token="") - -fields = [(('word', 'char'), (WORD, CHAR)), (None, None), ('ptbtag', PTB_TAG)] -train, val, test = datasets.UDPOS.splits(fields=fields) - -print(train.fields) -print(len(train)) -print(vars(train[0])) - -WORD.build_vocab(train.word, val.word, test.word, vectors=[GloVe(name='6B', dim='300')]) -CHAR.build_vocab(train.char, val.char, test.char) -PTB_TAG.build_vocab(train.ptbtag) - -print(CHAR.vocab.freqs) -train_iter, val_iter = data.BucketIterator.splits( - (train, val), batch_size=3) - -batch = next(iter(train_iter)) - -print("words", batch.word) -print("chars", batch.char) -print("ptbtags", batch.ptbtag) - -# Using the CoNLL 2000 Chunking dataset: -INPUTS = data.Field(init_token="", eos_token="") -CHUNK_TAGS = data.Field(init_token="", eos_token="") - -train, val, test = datasets.CoNLL2000Chunking.splits( - fields=(('inputs', INPUTS), (None, None), ('tags', CHUNK_TAGS)) -) -print(len(train), len(val), len(test)) diff --git a/test/legacy/sst.py b/test/legacy/sst.py deleted file mode 100644 index 6ba50fbbee..0000000000 --- a/test/legacy/sst.py +++ /dev/null @@ -1,69 +0,0 @@ -from torchtext.legacy import data -from torchtext.legacy import datasets -from torchtext.vocab import Vectors, GloVe, CharNGram, FastText - - -# Approach 1: -# set up fields -TEXT = data.Field() -LABEL = data.Field(sequential=False) - -# make splits for data -train, val, test = datasets.SST.splits( - TEXT, LABEL, fine_grained=True, train_subtrees=True, - filter_pred=lambda ex: ex.label != 'neutral') - -# print information about the data -print('train.fields', train.fields) -print('len(train)', len(train)) -print('vars(train[0])', vars(train[0])) - -# build the vocabulary -url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.vec' -TEXT.build_vocab(train, vectors=Vectors('wiki.simple.vec', url=url)) -LABEL.build_vocab(train) - -# print vocab information -print('len(TEXT.vocab)', len(TEXT.vocab)) -print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size()) - -# make iterator for splits -train_iter, val_iter, test_iter = data.BucketIterator.splits( - (train, val, test), batch_size=3) - -# print batch information -batch = next(iter(train_iter)) -print(batch.text) -print(batch.label) - -# Approach 2: -TEXT.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram(), FastText()]) -LABEL.build_vocab(train) - -# print vocab information -print('len(TEXT.vocab)', len(TEXT.vocab)) -print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size()) 
- -train_iter, val_iter, test_iter = datasets.SST.iters(batch_size=4) - -# print batch information -batch = next(iter(train_iter)) -print(batch.text) -print(batch.label) - -# Approach 3: -f = FastText() -TEXT.build_vocab(train, vectors=f) -TEXT.vocab.extend(f) -LABEL.build_vocab(train) - -# print vocab information -print('len(TEXT.vocab)', len(TEXT.vocab)) -print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size()) - -train_iter, val_iter, test_iter = datasets.SST.iters(batch_size=4) - -# print batch information -batch = next(iter(train_iter)) -print(batch.text) -print(batch.label) diff --git a/test/legacy/test_vocab.py b/test/legacy/test_vocab.py deleted file mode 100644 index 083a738483..0000000000 --- a/test/legacy/test_vocab.py +++ /dev/null @@ -1,131 +0,0 @@ -# -*- coding: utf-8 -*- -from collections import Counter -import os -import pickle - - -import numpy as np -import torch -from torchtext.legacy import vocab - -from ..common.torchtext_test_case import TorchtextTestCase - - -def conditional_remove(f): - if os.path.isfile(f): - os.remove(f) - - -class TestVocab(TorchtextTestCase): - - def test_vocab_basic(self): - c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) - v = vocab.Vocab(c, min_freq=3, specials=['', '', '']) - - expected_itos = ['', '', '', - 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'] - expected_stoi = {x: index for index, x in enumerate(expected_itos)} - self.assertEqual(v.itos, expected_itos) - self.assertEqual(dict(v.stoi), expected_stoi) - - def test_vocab_specials_first(self): - c = Counter("a a b b c c".split()) - - # add specials into vocabulary at first - v = vocab.Vocab(c, max_size=2, specials=['', '']) - expected_itos = ['', '', 'a', 'b'] - expected_stoi = {x: index for index, x in enumerate(expected_itos)} - self.assertEqual(v.itos, expected_itos) - self.assertEqual(dict(v.stoi), expected_stoi) - - # add specials into vocabulary at last - v = vocab.Vocab(c, max_size=2, specials=['', ''], specials_first=False) - expected_itos = ['a', 'b', '', ''] - expected_stoi = {x: index for index, x in enumerate(expected_itos)} - self.assertEqual(v.itos, expected_itos) - self.assertEqual(dict(v.stoi), expected_stoi) - - def test_vocab_without_unk(self): - c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) - oov_word = 'OOVWORD' - self.assertNotIn(oov_word, c) - - # tests for specials_first=True - v_first = vocab.Vocab(c, min_freq=3, specials=[''], specials_first=True) - expected_itos_first = ['', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'] - expected_stoi_first = {x: index for index, x in enumerate(expected_itos_first)} - self.assertEqual(v_first.itos, expected_itos_first) - self.assertEqual(dict(v_first.stoi), expected_stoi_first) - self.assertNotIn(oov_word, v_first.itos) - self.assertNotIn(oov_word, v_first.stoi) - - # tests for specials_first=False - v_last = vocab.Vocab(c, min_freq=3, specials=[''], specials_first=False) - expected_itos_last = ['ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', ''] - expected_stoi_last = {x: index for index, x in enumerate(expected_itos_last)} - self.assertEqual(v_last.itos, expected_itos_last) - self.assertEqual(dict(v_last.stoi), expected_stoi_last) - self.assertNotIn(oov_word, v_last.itos) - self.assertNotIn(oov_word, v_last.stoi) - - # check if pad is mapped to the first index - self.assertEqual(v_first.stoi[''], 0) - # check if pad is mapped to the last index - self.assertEqual(v_last.stoi[''], max(v_last.stoi.values())) - - # check if an oovword is not in vocab and a default unk_id is not assigned to it - 
self.assertRaises(KeyError, v_first.stoi.__getitem__, oov_word) - self.assertRaises(KeyError, v_last.stoi.__getitem__, oov_word) - - def test_vocab_set_vectors(self): - c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, - 'test': 4, 'freq_too_low': 2}) - v = vocab.Vocab(c, min_freq=3, specials=['', '', '']) - stoi = {"hello": 0, "world": 1, "test": 2} - vectors = torch.FloatTensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]) - dim = 2 - v.set_vectors(stoi, vectors, dim) - expected_vectors = np.array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], - [0.0, 0.0], [0.1, 0.2], [0.5, 0.6], - [0.3, 0.4]]) - self.assertEqual(v.vectors, expected_vectors, exact_dtype=False) - - def test_errors(self): - c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) - with self.assertRaises(ValueError): - # Test proper error raised when using unknown string alias - vocab.Vocab(c, min_freq=3, specials=['', '', ''], - vectors=["fasttext.english.300d"]) - vocab.Vocab(c, min_freq=3, specials=['', '', ''], - vectors="fasttext.english.300d") - with self.assertRaises(ValueError): - # Test proper error is raised when vectors argument is - # non-string or non-Vectors - vocab.Vocab(c, min_freq=3, specials=['', '', ''], - vectors={"word": [1, 2, 3]}) - - def test_serialization(self): - c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) - v = vocab.Vocab(c, min_freq=3, specials=['', '', '']) - pickle_path = os.path.join(self.test_dir, "vocab.pkl") - pickle.dump(v, open(pickle_path, "wb")) - v_loaded = pickle.load(open(pickle_path, "rb")) - assert v == v_loaded - - def test_serialization_backcompat(self): - # Test whether loading works on models saved in which - # the state was not required to have an "unk_index". - c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) - v = vocab.Vocab(c, min_freq=3, specials=['', '']) # no unk special - # Mock old vocabulary - del v.__dict__["unk_index"] - - pickle_path = os.path.join(self.test_dir, "vocab.pkl") - pickle.dump(v, open(pickle_path, "wb")) - v_loaded = pickle.load(open(pickle_path, "rb")) - assert v == v_loaded - - def test_has_unk(self): - c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) - v = vocab.Vocab(c) - self.assertEqual(v['not_in_it'], 0) diff --git a/test/legacy/translation.py b/test/legacy/translation.py deleted file mode 100644 index 8861fba93b..0000000000 --- a/test/legacy/translation.py +++ /dev/null @@ -1,102 +0,0 @@ -from torchtext.legacy import data -from torchtext.legacy import datasets - -import re -import spacy - -spacy_de = spacy.load('de_core_news_sm') -spacy_en = spacy.load('en_core_web_sm') - -url = re.compile('(.*)') - - -def tokenize_de(text): - return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))] - - -def tokenize_en(text): - return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))] - - -# Testing IWSLT -DE = data.Field(tokenize=tokenize_de) -EN = data.Field(tokenize=tokenize_en) - -train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN)) - -print(train.fields) -print(len(train)) -print(vars(train[0])) -print(vars(train[100])) - -DE.build_vocab(train.src, min_freq=3) -EN.build_vocab(train.trg, max_size=50000) - -train_iter, val_iter = data.BucketIterator.splits( - (train, val), batch_size=3) - -print(DE.vocab.freqs.most_common(10)) -print(len(DE.vocab)) -print(EN.vocab.freqs.most_common(10)) -print(len(EN.vocab)) - -batch = next(iter(train_iter)) -print(batch.src) -print(batch.trg) - - -# Testing Multi30k -DE = 
data.Field(tokenize=tokenize_de) -EN = data.Field(tokenize=tokenize_en) - -train, val, test = datasets.Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN)) - -print(train.fields) -print(len(train)) -print(vars(train[0])) -print(vars(train[100])) - -DE.build_vocab(train.src, min_freq=3) -EN.build_vocab(train.trg, max_size=50000) - -train_iter, val_iter = data.BucketIterator.splits( - (train, val), batch_size=3) - -print(DE.vocab.freqs.most_common(10)) -print(len(DE.vocab)) -print(EN.vocab.freqs.most_common(10)) -print(len(EN.vocab)) - -batch = next(iter(train_iter)) -print(batch.src) -print(batch.trg) - - -# Testing custom paths -DE = data.Field(tokenize=tokenize_de) -EN = data.Field(tokenize=tokenize_en) - -train, val = datasets.TranslationDataset.splits( - path='.data/multi30k/', train='train', - validation='val', test=None, exts=('.de', '.en'), - fields=(DE, EN)) - -print(train.fields) -print(len(train)) -print(vars(train[0])) -print(vars(train[100])) - -DE.build_vocab(train.src, min_freq=3) -EN.build_vocab(train.trg, max_size=50000) - -train_iter, val_iter = data.BucketIterator.splits( - (train, val), batch_size=3) - -print(DE.vocab.freqs.most_common(10)) -print(len(DE.vocab)) -print(EN.vocab.freqs.most_common(10)) -print(len(EN.vocab)) - -batch = next(iter(train_iter)) -print(batch.src) -print(batch.trg) diff --git a/test/legacy/trec.py b/test/legacy/trec.py deleted file mode 100644 index 43a0c00a08..0000000000 --- a/test/legacy/trec.py +++ /dev/null @@ -1,46 +0,0 @@ -from torchtext.legacy import data -from torchtext.legacy import datasets -from torchtext.vocab import GloVe, CharNGram - - -# Approach 1: -# set up fields -TEXT = data.Field(lower=True, include_lengths=True, batch_first=True) -LABEL = data.Field(sequential=False) - - -# make splits for data -train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True) - -# print information about the data -print('train.fields', train.fields) -print('len(train)', len(train)) -print('vars(train[0])', vars(train[0])) - -# build the vocabulary -TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300)) -LABEL.build_vocab(train) - -# print vocab information -print('len(TEXT.vocab)', len(TEXT.vocab)) -print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size()) - -# make iterator for splits -train_iter, test_iter = data.BucketIterator.splits( - (train, test), batch_size=3) - -# print batch information -batch = next(iter(train_iter)) -print(batch.text) -print(batch.label) - -# Approach 2: -TEXT.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram()]) -LABEL.build_vocab(train) - -train_iter, test_iter = datasets.TREC.iters(batch_size=4) - -# print batch information -batch = next(iter(train_iter)) -print(batch.text) -print(batch.label) diff --git a/test/test_build.py b/test/test_build.py index ad3781c6c4..7e3ffd520f 100644 --- a/test/test_build.py +++ b/test/test_build.py @@ -1,108 +1,12 @@ #!/usr/bin/env python3 """Tests that requires external resources (Network access to fetch dataset)""" import os -import unittest -from collections import Counter - import torch import torchtext.data from .common.torchtext_test_case import TorchtextTestCase -class TestNestedField(TorchtextTestCase): - def test_build_vocab(self): - nesting_field = torchtext.legacy.data.Field(tokenize=list, init_token="", eos_token="") - - field = torchtext.legacy.data.NestedField( - nesting_field, init_token='', eos_token='', - include_lengths=True, - pad_first=True) - - sources = [ - [['a'], ['s', 'e', 'n', 't', 'e', 'n', 'c', 'e'], ['o', 'f'], ['d', 'a', 
't', 'a'], ['.']], - [['y', 'e', 't'], ['a', 'n', 'o', 't', 'h', 'e', 'r']], - [['o', 'n', 'e'], ['l', 'a', 's', 't'], ['s', 'e', 'n', 't']] - ] - - field.build_vocab( - sources, vectors='glove.6B.50d', - unk_init=torch.nn.init.normal_, vectors_cache=".vector_cache") - - -class TestDataset(TorchtextTestCase): - def test_csv_file_no_header_one_col_multiple_fields(self): - self.write_test_ppid_dataset(data_format="csv") - - question_field = torchtext.legacy.data.Field(sequential=True) - spacy_tok_question_field = torchtext.legacy.data.Field(sequential=True, tokenize="spacy") - label_field = torchtext.legacy.data.Field(sequential=False) - # Field name/value as nested tuples - fields = [("ids", None), - (("q1", "q1_spacy"), (question_field, spacy_tok_question_field)), - (("q2", "q2_spacy"), (question_field, spacy_tok_question_field)), - ("label", label_field)] - dataset = torchtext.legacy.data.TabularDataset( - path=self.test_ppid_dataset_path, format="csv", fields=fields) - expected_examples = [ - (["When", "do", "you", "use", "シ", "instead", "of", "し?"], - ["When", "do", "you", "use", "シ", "instead", "of", "し", "?"], - ["When", "do", "you", "use", "\"&\"", - "instead", "of", "\"and\"?"], - ["When", "do", "you", "use", "\"", "&", "\"", - "instead", "of", "\"", "and", "\"", "?"], "0"), - (["Where", "was", "Lincoln", "born?"], - ["Where", "was", "Lincoln", "born", "?"], - ["Which", "location", "was", "Abraham", "Lincoln", "born?"], - ["Which", "location", "was", "Abraham", "Lincoln", "born", "?"], - "1"), - (["What", "is", "2+2"], ["What", "is", "2", "+", "2"], - ["2+2=?"], ["2", "+", "2=", "?"], "1")] - for i, example in enumerate(dataset): - self.assertEqual(example.q1, expected_examples[i][0]) - self.assertEqual(example.q1_spacy, expected_examples[i][1]) - self.assertEqual(example.q2, expected_examples[i][2]) - self.assertEqual(example.q2_spacy, expected_examples[i][3]) - self.assertEqual(example.label, expected_examples[i][4]) - - # 6 Fields including None for ids - assert len(dataset.fields) == 6 - - def test_json_dataset_one_key_multiple_fields(self): - self.write_test_ppid_dataset(data_format="json") - - question_field = torchtext.legacy.data.Field(sequential=True) - spacy_tok_question_field = torchtext.legacy.data.Field(sequential=True, tokenize="spacy") - label_field = torchtext.legacy.data.Field(sequential=False) - fields = {"question1": [("q1", question_field), - ("q1_spacy", spacy_tok_question_field)], - "question2": [("q2", question_field), - ("q2_spacy", spacy_tok_question_field)], - "label": ("label", label_field)} - dataset = torchtext.legacy.data.TabularDataset( - path=self.test_ppid_dataset_path, format="json", fields=fields) - expected_examples = [ - (["When", "do", "you", "use", "シ", "instead", "of", "し?"], - ["When", "do", "you", "use", "シ", "instead", "of", "し", "?"], - ["When", "do", "you", "use", "\"&\"", - "instead", "of", "\"and\"?"], - ["When", "do", "you", "use", "\"", "&", "\"", - "instead", "of", "\"", "and", "\"", "?"], "0"), - (["Where", "was", "Lincoln", "born?"], - ["Where", "was", "Lincoln", "born", "?"], - ["Which", "location", "was", "Abraham", "Lincoln", "born?"], - ["Which", "location", "was", "Abraham", "Lincoln", "born", "?"], - "1"), - (["What", "is", "2+2"], ["What", "is", "2", "+", "2"], - ["2+2=?"], ["2", "+", "2=", "?"], "1")] - for i, example in enumerate(dataset): - self.assertEqual(example.q1, expected_examples[i][0]) - self.assertEqual(example.q1_spacy, expected_examples[i][1]) - self.assertEqual(example.q2, expected_examples[i][2]) - 
self.assertEqual(example.q2_spacy, expected_examples[i][3]) - self.assertEqual(example.label, expected_examples[i][4]) - - class TestDataUtils(TorchtextTestCase): TEST_STR = "A string, particularly one with slightly complex punctuation." @@ -142,53 +46,37 @@ def test_vectors_get_vecs(self): self.assertEqual(vec[tokens[0].lower()], token_one_vec) def test_download_charngram_vectors(self): - c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) - # Build a vocab and get vectors twice to test caching, then once more - # to test string aliases. - for i in range(3): - if i == 2: - vectors = "charngram.100d" - else: - vectors = torchtext.vocab.CharNGram() - v = torchtext.legacy.vocab.Vocab( - c, min_freq=3, specials=['', '', ''], vectors=vectors) - expected_itos = ['', '', '', - 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'] - expected_stoi = {x: index for index, x in enumerate(expected_itos)} - self.assertEqual(v.itos, expected_itos) - self.assertEqual(dict(v.stoi), expected_stoi) - vectors = v.vectors - + # Build a vocab and get vectors twice to test caching. + for _ in range(2): + vectors = torchtext.vocab.CharNGram() # The first 5 entries in each vector. expected_charngram = { 'hello': [-0.44782442, -0.08937783, -0.34227219, -0.16233221, -0.39343098], - 'world': [-0.29590717, -0.05275926, -0.37334684, 0.27117205, -0.3868292], + 'world': [-0.29590717, -0.05275926, -0.37334684, + 0.27117205, -0.3868292], } for word in expected_charngram: self.assertEqual( - vectors[v.stoi[word], :5], expected_charngram[word]) + vectors[word][0, :5], expected_charngram[word]) + + self.assertEqual(vectors[''][0], torch.zeros(100)) - self.assertEqual(vectors[v.stoi['']], torch.zeros(100)) - self.assertEqual(vectors[v.stoi['OOV token']], torch.zeros(100)) + # The first 5 entries for `OOV token` + expected_oov_token_charngram = [-0.1070, -0.2240, -0.3043, + -0.1092, 0.0953] + self.assertEqual(vectors['OOV token'][0, :5], + expected_oov_token_charngram, atol=0, rtol=10e-4) def test_download_custom_vectors(self): - c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) # Build a vocab and get vectors twice to test caching. for _ in range(2): - v = torchtext.legacy.vocab.Vocab( - c, min_freq=3, specials=['', '', ''], - vectors=torchtext.vocab.Vectors( - 'wiki.simple.vec', - url=torchtext.vocab.FastText.url_base.format('simple') - ) + vectors = torchtext.vocab.Vectors( + 'wiki.simple.vec', + url=torchtext.vocab.FastText.url_base.format('simple') ) - self.assertEqual(v.itos, ['', '', '', - 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']) - vectors = v.vectors - # The first 5 entries in each vector. expected_fasttext_simple_en = { 'hello': [0.39567, 0.21454, -0.035389, -0.24299, -0.095645], @@ -197,29 +85,14 @@ def test_download_custom_vectors(self): for word in expected_fasttext_simple_en: self.assertEqual( - vectors[v.stoi[word], :5], expected_fasttext_simple_en[word]) + vectors[word][:5], expected_fasttext_simple_en[word]) - self.assertEqual(vectors[v.stoi['']], torch.zeros(300)) + self.assertEqual(vectors[''], torch.zeros(300)) def test_download_fasttext_vectors(self): - c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) - # Build a vocab and get vectors twice to test caching, then once more - # to test string aliases. 
- for i in range(3): - if i == 2: - vectors = "fasttext.simple.300d" - else: - vectors = torchtext.vocab.FastText(language='simple') - - v = torchtext.legacy.vocab.Vocab( - c, min_freq=3, specials=['', '', ''], vectors=vectors) - - expected_itos = ['', '', '', - 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'] - expected_stoi = {x: index for index, x in enumerate(expected_itos)} - self.assertEqual(v.itos, expected_itos) - self.assertEqual(dict(v.stoi), expected_stoi) - vectors = v.vectors + # Build a vocab and get vectors twice to test caching. + for _ in range(2): + vectors = torchtext.vocab.FastText(language='simple') # The first 5 entries in each vector. expected_fasttext_simple_en = { @@ -229,91 +102,37 @@ def test_download_fasttext_vectors(self): for word in expected_fasttext_simple_en: self.assertEqual( - vectors[v.stoi[word], :5], expected_fasttext_simple_en[word]) + vectors[word][:5], expected_fasttext_simple_en[word]) - self.assertEqual(vectors[v.stoi['']], torch.zeros(300)) - self.assertEqual(vectors[v.stoi['OOV token']], torch.zeros(300)) + self.assertEqual(vectors[''], torch.zeros(300)) + self.assertEqual(vectors['OOV token'], torch.zeros(300)) def test_download_glove_vectors(self): - c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) - - # Build a vocab and get vectors twice to test caching, then once more - # to test string aliases. - for i in range(3): - if i == 2: - vectors = "glove.twitter.27B.25d" - else: - vectors = torchtext.vocab.GloVe(name='twitter.27B', dim='25') - v = torchtext.legacy.vocab.Vocab( - c, min_freq=3, specials=['', '', ''], vectors=vectors) - - expected_itos = ['', '', '', - 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'] - expected_stoi = {x: index for index, x in enumerate(expected_itos)} - self.assertEqual(v.itos, expected_itos) - self.assertEqual(dict(v.stoi), expected_stoi) - - vectors = v.vectors - - # The first 5 entries in each vector. - expected_twitter = { - 'hello': [-0.77069, 0.12827, 0.33137, 0.0050893, -0.47605], - 'world': [0.10301, 0.095666, -0.14789, -0.22383, -0.14775], - } - - for word in expected_twitter: - self.assertEqual( - vectors[v.stoi[word], :5], expected_twitter[word]) - - self.assertEqual(vectors[v.stoi['']], torch.zeros(25)) - self.assertEqual(vectors[v.stoi['OOV token']], torch.zeros(25)) - - def test_extend(self): - c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) # Build a vocab and get vectors twice to test caching. - for _ in range(2): - f = torchtext.vocab.FastText(language='simple') - v = torchtext.legacy.vocab.Vocab( - c, min_freq=3, specials=['', '', ''], vectors=f) - n_vocab = len(v) - v.extend(f) # extend the vocab with the words contained in f.itos - self.assertGreater(len(v), n_vocab) + vectors = torchtext.vocab.GloVe(name='twitter.27B', dim='25') + # The first 5 entries in each vector. + expected_twitter = { + 'hello': [-0.77069, 0.12827, 0.33137, 0.0050893, -0.47605], + 'world': [0.10301, 0.095666, -0.14789, -0.22383, -0.14775], + } - self.assertEqual(v.itos[:6], ['', '', '', - 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']) - vectors = v.vectors + for word in expected_twitter: + self.assertEqual( + vectors[word][:5], expected_twitter[word]) - # The first 5 entries in each vector. 
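# Editor's note: a minimal, hedged sketch (not part of the patch) of the vector
# lookup pattern the rewritten tests above rely on. The pre-trained vector
# classes (GloVe, FastText, CharNGram) subclass torchtext.vocab.Vectors, so a
# token is looked up directly by its string instead of going through a legacy
# Vocab's stoi table, and unknown tokens fall back to a zero vector by default.
import torch
from torchtext.vocab import GloVe

vectors = GloVe(name='twitter.27B', dim='25')     # downloaded and cached on first use
hello = vectors['hello']                          # 1-D tensor of size 25
missing = vectors['definitely-not-in-the-vocab']  # OOV token -> zero vector
print(hello[:5])
print(torch.equal(missing, torch.zeros(25)))      # True
# CharNGram is the one exception: its lookups return a (1, 100) tensor, which is
# why the charngram test above indexes with vectors[word][0, :5].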
- expected_fasttext_simple_en = { - 'hello': [0.39567, 0.21454, -0.035389, -0.24299, -0.095645], - 'world': [0.10444, -0.10858, 0.27212, 0.13299, -0.33165], - } - - for word in expected_fasttext_simple_en: - self.assertEqual( - vectors[v.stoi[word], :5], expected_fasttext_simple_en[word]) - - self.assertEqual(vectors[v.stoi['']], torch.zeros(300)) + self.assertEqual(vectors[''], torch.zeros(25)) + self.assertEqual(vectors['OOV token'], torch.zeros(25)) - @unittest.skip("Download temp. slow.") def test_vectors_custom_cache(self): - c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) vector_cache = os.path.join('/tmp', 'vector_cache') # Build a vocab and get vectors twice to test caching. for i in range(2): if i == 1: self.assertTrue(os.path.exists(vector_cache)) - v = torchtext.legacy.vocab.Vocab( - c, min_freq=3, specials=['', '', ''], - vectors=torchtext.vocab.Vectors( - 'wiki.simple.vec', cache=vector_cache, - url=torchtext.vocab.FastText.url_base.format('simple')) - ) - - self.assertEqual(v.itos, ['', '', '', - 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']) - vectors = v.vectors + vectors = torchtext.vocab.Vectors( + 'wiki.simple.vec', cache=vector_cache, + url=torchtext.vocab.FastText.url_base.format('simple')) # The first 5 entries in each vector. expected_fasttext_simple_en = { @@ -323,6 +142,6 @@ def test_vectors_custom_cache(self): for word in expected_fasttext_simple_en: self.assertEqual( - vectors[v.stoi[word], :5], expected_fasttext_simple_en[word]) + vectors[word][:5], expected_fasttext_simple_en[word]) - self.assertEqual(vectors[v.stoi['']], torch.zeros(300)) + self.assertEqual(vectors[''], torch.zeros(300)) diff --git a/test/test_transforms.py b/test/test_transforms.py index bff6303c78..d7df6d3876 100644 --- a/test/test_transforms.py +++ b/test/test_transforms.py @@ -95,9 +95,6 @@ def test_labeltoindex(self): expected = [0, 1, 2] self.assertEqual(actual, expected) - with self.assertRaises(RuntimeError): - transform(['OOV']) - transform = transforms.LabelToIndex(label_names=label_names, sort_names=True) actual = transform(label_names) expected = [2, 1, 0] diff --git a/torchtext/__init__.py b/torchtext/__init__.py index 212b902bd5..773d2dd19e 100644 --- a/torchtext/__init__.py +++ b/torchtext/__init__.py @@ -11,7 +11,6 @@ from . import functional from . import models from . import experimental -from . 
import legacy from ._extension import _init_extension @@ -28,8 +27,7 @@ 'transforms', 'functional', 'models', - 'experimental', - 'legacy'] + 'experimental'] _init_extension() diff --git a/torchtext/experimental/datasets/sst2.py b/torchtext/experimental/datasets/sst2.py index d0653b4954..6568cde8a6 100644 --- a/torchtext/experimental/datasets/sst2.py +++ b/torchtext/experimental/datasets/sst2.py @@ -43,7 +43,7 @@ _FIRST_LINE_MD5 = { "train": "2552b8cecd57b2e022ef23411c688fa8", "dev": "1b0ffd6aa5f2bf0fd9840a5f6f1a9f07", - "test": "3e7ff69ab3fc6d026e3c96cadd8b0b53", + "test": "f838c81fe40bfcd7e42e9ffc4dd004f7", } DATASET_NAME = "SST2" @@ -97,13 +97,6 @@ def _get_datapipe(self, root, split, validate_hash): ) # Parse CSV file and yield data samples - if split == "test": - parsed_data = extracted_files.parse_csv(skip_lines=1, delimiter="\t").map( - lambda x: (x[1],) - ) - else: - parsed_data = extracted_files.parse_csv(skip_lines=1, delimiter="\t").map( - lambda x: (x[0], x[1]) - ) - - return parsed_data + return extracted_files.parse_csv(skip_lines=1, delimiter="\t").map( + lambda x: (x[0], x[1]) + ) diff --git a/torchtext/legacy/README.rst b/torchtext/legacy/README.rst deleted file mode 100644 index 41fac9ecf6..0000000000 --- a/torchtext/legacy/README.rst +++ /dev/null @@ -1,53 +0,0 @@ -Legacy -====== - -In v0.9.0 release, we move the following legacy code to `torchtext.legacy <#legacy>`_. This is part of the work to revamp the torchtext library and the motivation has been discussed in `Issue #664 `_: - -* ``torchtext.legacy.data.field`` -* ``torchtext.legacy.data.batch`` -* ``torchtext.legacy.data.example`` -* ``torchtext.legacy.data.iterator`` -* ``torchtext.legacy.data.pipeline`` -* ``torchtext.legacy.datasets`` - -We have a `migration tutorial `_ to help users switch to the torchtext datasets in ``v0.9.0`` release. For the users who still want the legacy components, they can add ``legacy`` to the import path. - -Another option is to import ``torchtext.legacy`` as ``torchtext``. For example: - -With `torchtext v0.8.1` - - .. code-block:: python - - >>> import torchtext - >>> import torch - - >>> TEXT = torchtext.data.Field(tokenize=torchtext.data.get_tokenizer('basic_english'), - init_token='', eos_token='', lower=True) - >>> LABEL = torchtext.data.LabelField(dtype = torch.long) - >>> train_split, test_split = torchtext.datasets.IMDB.splits(TEXT, LABEL) - >>> TEXT.build_vocab(train_split) - >>> LABEL.build_vocab(train_split) - - >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - >>> train_iterator, test_iterator = torchtext.data.Iterator.splits( - (train_split, test_split), batch_size=8, device = device) - >>> next(iter(train_iterator)) - -With `torchtext v0.9.0` - - .. 
code-block:: python - - >>> import torchtext.legacy as torchtext # need to change only one line - >>> import torch - - >>> TEXT = torchtext.data.Field(tokenize=torchtext.data.get_tokenizer('basic_english'), - init_token='', eos_token='', lower=True) - >>> LABEL = torchtext.data.LabelField(dtype = torch.long) - >>> train_split, test_split = torchtext.datasets.IMDB.splits(TEXT, LABEL) - >>> TEXT.build_vocab(train_split) - >>> LABEL.build_vocab(train_split) - - >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - >>> train_iterator, test_iterator = torchtext.data.Iterator.splits( - (train_split, test_split), batch_size=8, device = device) - >>> next(iter(train_iterator)) diff --git a/torchtext/legacy/__init__.py b/torchtext/legacy/__init__.py deleted file mode 100644 index 5ff23f46d3..0000000000 --- a/torchtext/legacy/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from . import data -from .. import nn # Not in the legacy folder -from . import datasets -from .. import utils # Not in the legacy folder -from . import vocab - -__all__ = ['data', - 'nn', - 'datasets', - 'utils', - 'vocab'] diff --git a/torchtext/legacy/data/__init__.py b/torchtext/legacy/data/__init__.py deleted file mode 100644 index 3c84a2ef11..0000000000 --- a/torchtext/legacy/data/__init__.py +++ /dev/null @@ -1,34 +0,0 @@ -from .batch import Batch -from .example import Example -from .field import RawField, Field, ReversibleField, SubwordField, NestedField, LabelField -from .iterator import (batch, BucketIterator, Iterator, BPTTIterator, pool) -from .pipeline import Pipeline -from .dataset import Dataset, TabularDataset -# Those are not in the legacy folder. -from ...data import metrics -from ...data.metrics import bleu_score -from ...data import utils -from ...data.utils import get_tokenizer, interleave_keys -from ...data import functional -from ...data.functional import generate_sp_model, \ - load_sp_model, \ - sentencepiece_numericalizer, \ - sentencepiece_tokenizer, custom_replace, simple_space_split, \ - numericalize_tokens_from_iterator - -__all__ = ["Batch", - "Example", - "RawField", "Field", "ReversibleField", "SubwordField", "NestedField", - "LabelField", - "batch", "BucketIterator", "Iterator", "BPTTIterator", "pool", - "Pipeline", - "Dataset", "TabularDataset", - "metrics", - "bleu_score", - "utils", - "get_tokenizer", "interleave_keys", - "functional", - "generate_sp_model", "load_sp_model", - "sentencepiece_numericalizer", "sentencepiece_tokenizer", - "custom_replace", "simple_space_split", - "numericalize_tokens_from_iterator"] diff --git a/torchtext/legacy/data/batch.py b/torchtext/legacy/data/batch.py deleted file mode 100644 index 3e4250d29f..0000000000 --- a/torchtext/legacy/data/batch.py +++ /dev/null @@ -1,101 +0,0 @@ -import torch - - -class Batch(object): - """Defines a batch of examples along with its Fields. - - Attributes: - batch_size: Number of examples in the batch. - dataset: A reference to the dataset object the examples come from - (which itself contains the dataset's Field objects). - train: Deprecated: this attribute is left for backwards compatibility, - however it is UNUSED as of the merger with pytorch 0.4. - input_fields: The names of the fields that are used as input for the model - target_fields: The names of the fields that are used as targets during - model training - - Also stores the Variable for each column in the batch as an attribute. 
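# Editor's note: a hedged sketch (not part of the patch) of the replacement
# pattern the migration notes above point users to once torchtext.legacy is
# gone: tokenize with get_tokenizer, build a vocab with
# build_vocab_from_iterator, and batch the raw (label, text) pairs with a plain
# torch DataLoader instead of Field/BucketIterator. The dataset choice and the
# collate details here are illustrative only.
import torch
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import IMDB
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(
    (tokenizer(text) for _, text in IMDB(split='train')),
    specials=['<unk>', '<pad>'])
vocab.set_default_index(vocab['<unk>'])

def collate(batch):
    labels, texts = zip(*batch)
    ids = [torch.tensor(vocab(tokenizer(t)), dtype=torch.long) for t in texts]
    padded = torch.nn.utils.rnn.pad_sequence(ids, padding_value=vocab['<pad>'])
    return padded, labels

train_loader = DataLoader(list(IMDB(split='train')), batch_size=8,
                          shuffle=True, collate_fn=collate)
batch_text, batch_labels = next(iter(train_loader))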
- """ - - def __init__(self, data=None, dataset=None, device=None): - """Create a Batch from a list of examples.""" - if data is not None: - self.batch_size = len(data) - self.dataset = dataset - self.fields = dataset.fields.keys() # copy field names - self.input_fields = [k for k, v in dataset.fields.items() if - v is not None and not v.is_target] - self.target_fields = [k for k, v in dataset.fields.items() if - v is not None and v.is_target] - - for (name, field) in dataset.fields.items(): - if field is not None: - batch = [getattr(x, name) for x in data] - setattr(self, name, field.process(batch, device=device)) - - @classmethod - def fromvars(cls, dataset, batch_size, train=None, **kwargs): - """Create a Batch directly from a number of Variables.""" - batch = cls() - batch.batch_size = batch_size - batch.dataset = dataset - batch.fields = dataset.fields.keys() - for k, v in kwargs.items(): - setattr(batch, k, v) - return batch - - def __repr__(self): - return str(self) - - def __str__(self): - if not self.__dict__: - return 'Empty {} instance'.format(torch.typename(self)) - - fields_to_index = filter(lambda field: field is not None, self.fields) - var_strs = '\n'.join(['\t[.' + name + ']' + ":" + _short_str(getattr(self, name)) - for name in fields_to_index if hasattr(self, name)]) - - data_str = (' from {}'.format(self.dataset.name.upper()) - if hasattr(self.dataset, 'name') - and isinstance(self.dataset.name, str) else '') - - strt = '[{} of size {}{}]\n{}'.format(torch.typename(self), - self.batch_size, data_str, var_strs) - return '\n' + strt - - def __len__(self): - return self.batch_size - - def _get_field_values(self, fields): - if len(fields) == 0: - return None - elif len(fields) == 1: - return getattr(self, fields[0]) - else: - return tuple(getattr(self, f) for f in fields) - - def __iter__(self): - yield self._get_field_values(self.input_fields) - yield self._get_field_values(self.target_fields) - - -def _short_str(tensor): - # unwrap variable to tensor - if not torch.is_tensor(tensor): - # (1) unpack variable - if hasattr(tensor, 'data'): - tensor = tensor.data - # (2) handle include_lengths - elif isinstance(tensor, tuple): - return str(tuple(_short_str(t) for t in tensor)) - # (3) fallback to default str - else: - return str(tensor) - - # copied from torch _tensor_str - size_str = 'x'.join(str(size) for size in tensor.size()) - device_str = '' if not tensor.is_cuda else \ - ' (GPU {})'.format(tensor.get_device()) - strt = '[{} of size {}{}]'.format(torch.typename(tensor), - size_str, device_str) - return strt diff --git a/torchtext/legacy/data/dataset.py b/torchtext/legacy/data/dataset.py deleted file mode 100644 index 3af51df910..0000000000 --- a/torchtext/legacy/data/dataset.py +++ /dev/null @@ -1,362 +0,0 @@ -import io -import os -import zipfile -import tarfile -import gzip -import shutil -from functools import partial - -import torch.utils.data - -from torchtext.data.utils import RandomShuffler -from .example import Example -from torchtext.utils import download_from_url, unicode_csv_reader - - -class Dataset(torch.utils.data.Dataset): - """Defines a dataset composed of Examples along with its Fields. - - Attributes: - sort_key (callable): A key to use for sorting dataset examples for batching - together examples with similar lengths to minimize padding. - examples (list(Example)): The examples in this dataset. - fields (dict[str, Field]): Contains the name of each column or field, together - with the corresponding Field object. 
Two fields with the same Field object - will have a shared vocabulary. - """ - sort_key = None - - def __init__(self, examples, fields, filter_pred=None): - """Create a dataset from a list of Examples and Fields. - - Arguments: - examples: List of Examples. - fields (List(tuple(str, Field))): The Fields to use in this tuple. The - string is a field name, and the Field is the associated field. - filter_pred (callable or None): Use only examples for which - filter_pred(example) is True, or use all examples if None. - Default is None. - """ - if filter_pred is not None: - make_list = isinstance(examples, list) - examples = filter(filter_pred, examples) - if make_list: - examples = list(examples) - self.examples = examples - self.fields = dict(fields) - # Unpack field tuples - for n, f in list(self.fields.items()): - if isinstance(n, tuple): - self.fields.update(zip(n, f)) - del self.fields[n] - - @classmethod - def splits(cls, path=None, root='.data', train=None, validation=None, - test=None, **kwargs): - """Create Dataset objects for multiple splits of a dataset. - - Arguments: - path (str): Common prefix of the splits' file paths, or None to use - the result of cls.download(root). - root (str): Root dataset storage directory. Default is '.data'. - train (str): Suffix to add to path for the train set, or None for no - train set. Default is None. - validation (str): Suffix to add to path for the validation set, or None - for no validation set. Default is None. - test (str): Suffix to add to path for the test set, or None for no test - set. Default is None. - Remaining keyword arguments: Passed to the constructor of the - Dataset (sub)class being used. - - Returns: - Tuple[Dataset]: Datasets for train, validation, and - test splits in that order, if provided. - """ - if path is None: - path = cls.download(root) - train_data = None if train is None else cls( - os.path.join(path, train), **kwargs) - val_data = None if validation is None else cls( - os.path.join(path, validation), **kwargs) - test_data = None if test is None else cls( - os.path.join(path, test), **kwargs) - return tuple(d for d in (train_data, val_data, test_data) - if d is not None) - - def split(self, split_ratio=0.7, stratified=False, strata_field='label', - random_state=None): - """Create train-test(-valid?) splits from the instance's examples. - - Arguments: - split_ratio (float or List of floats): a number [0, 1] denoting the amount - of data to be used for the training split (rest is used for test), - or a list of numbers denoting the relative sizes of train, test and valid - splits respectively. If the relative size for valid is missing, only the - train-test split is returned. Default is 0.7 (for the train set). - stratified (bool): whether the sampling should be stratified. - Default is False. - strata_field (str): name of the examples Field stratified over. - Default is 'label' for the conventional label field. - random_state (tuple): the random seed used for shuffling. - A return value of `random.getstate()`. - - Returns: - Tuple[Dataset]: Datasets for train, validation, and - test splits in that order, if the splits are provided. 
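# Editor's note: a hedged usage sketch (not part of the patch) of the
# Dataset.split() API documented above; it runs only against torchtext builds
# that still ship torchtext.legacy, and the toy examples are illustrative.
# Note the asymmetry: a ratio list is ordered (train, test, valid), while the
# returned datasets come back as (train, valid, test).
from torchtext.legacy import data

TEXT = data.Field()
LABEL = data.LabelField()
fields = [('text', TEXT), ('label', LABEL)]
examples = [data.Example.fromlist(['sentence number %d' % i, str(i % 2)], fields)
            for i in range(100)]
dataset = data.Dataset(examples, fields)

train, valid, test = dataset.split(
    split_ratio=[0.7, 0.2, 0.1], stratified=True, strata_field='label')
print(len(train), len(valid), len(test))   # 70 10 20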
- """ - train_ratio, test_ratio, val_ratio = check_split_ratio(split_ratio) - - # For the permutations - rnd = RandomShuffler(random_state) - if not stratified: - train_data, test_data, val_data = rationed_split(self.examples, train_ratio, - test_ratio, val_ratio, rnd) - else: - if strata_field not in self.fields: - raise ValueError("Invalid field name for strata_field {}" - .format(strata_field)) - strata = stratify(self.examples, strata_field) - train_data, test_data, val_data = [], [], [] - for group in strata: - # Stratify each group and add together the indices. - group_train, group_test, group_val = rationed_split(group, train_ratio, - test_ratio, val_ratio, - rnd) - train_data += group_train - test_data += group_test - val_data += group_val - - splits = tuple(Dataset(d, self.fields) - for d in (train_data, val_data, test_data) if d) - - # In case the parent sort key isn't none - if self.sort_key: - for subset in splits: - subset.sort_key = self.sort_key - return splits - - def __getitem__(self, i): - return self.examples[i] - - def __len__(self): - try: - return len(self.examples) - except TypeError: - return 2**32 - - def __iter__(self): - for x in self.examples: - yield x - - def __getattr__(self, attr): - if attr in self.fields: - for x in self.examples: - yield getattr(x, attr) - - @classmethod - def download(cls, root, check=None): - """Download and unzip an online archive (.zip, .gz, or .tgz). - - Arguments: - root (str): Folder to download data to. - check (str or None): Folder whose existence indicates - that the dataset has already been downloaded, or - None to check the existence of root/{cls.name}. - - Returns: - str: Path to extracted dataset. - """ - path = os.path.join(root, cls.name) - check = path if check is None else check - if not os.path.isdir(check): - for url in cls.urls: - if isinstance(url, tuple): - url, filename = url - else: - filename = os.path.basename(url) - zpath = os.path.join(path, filename) - if not os.path.isfile(zpath): - if not os.path.exists(os.path.dirname(zpath)): - os.makedirs(os.path.dirname(zpath)) - print('downloading {}'.format(filename)) - download_from_url(url, zpath) - zroot, ext = os.path.splitext(zpath) - _, ext_inner = os.path.splitext(zroot) - if ext == '.zip': - with zipfile.ZipFile(zpath, 'r') as zfile: - print('extracting') - zfile.extractall(path) - # tarfile cannot handle bare .gz files - elif ext == '.tgz' or ext == '.gz' and ext_inner == '.tar': - with tarfile.open(zpath, 'r:gz') as tar: - dirs = [member for member in tar.getmembers()] - tar.extractall(path=path, members=dirs) - elif ext == '.gz': - with gzip.open(zpath, 'rb') as gz: - with open(zroot, 'wb') as uncompressed: - shutil.copyfileobj(gz, uncompressed) - - return os.path.join(path, cls.dirname) - - def filter_examples(self, field_names): - """Remove unknown words from dataset examples with respect to given field. - - Arguments: - field_names (list(str)): Within example only the parts with field names in - field_names will have their unknown words deleted. 
- """ - for i, example in enumerate(self.examples): - for field_name in field_names: - vocab = set(self.fields[field_name].vocab.stoi) - text = getattr(example, field_name) - example_part = [word for word in text if word in vocab] - setattr(example, field_name, example_part) - self.examples[i] = example - - -class TabularDataset(Dataset): - """Defines a Dataset of columns stored in CSV, TSV, or JSON format.""" - - def __init__(self, path, format, fields, skip_header=False, - csv_reader_params=None, **kwargs): - """Create a TabularDataset given a path, file format, and field list. - - Args: - path (str): Path to the data file. - format (str): The format of the data file. One of "CSV", "TSV", or - "JSON" (case-insensitive). - fields ((list(tuple(str, Field)) or dict[str: tuple(str, Field)): If using a list, - the format must be CSV or TSV, and the values of the list - should be tuples of (name, field). - The fields should be in the same order as the columns in the CSV or TSV - file, while tuples of (name, None) represent columns that will be ignored. - - If using a dict, the keys should be a subset of the JSON keys or CSV/TSV - columns, and the values should be tuples of (name, field). - Keys not present in the input dictionary are ignored. - This allows the user to rename columns from their JSON/CSV/TSV key names - and also enables selecting a subset of columns to load. - skip_header (bool): Whether to skip the first line of the input file. - csv_reader_params(dict): Parameters to pass to the csv reader. - Only relevant when format is csv or tsv. - See - https://docs.python.org/3/library/csv.html#csv.reader - for more details. - kwargs (dict): passed to the Dataset parent class. - """ - if csv_reader_params is None: - csv_reader_params = {} - format = format.lower() - make_example = { - 'json': Example.fromJSON, 'dict': Example.fromdict, - 'tsv': Example.fromCSV, 'csv': Example.fromCSV}[format] - - with io.open(os.path.expanduser(path), encoding="utf8") as f: - if format == 'csv': - reader = unicode_csv_reader(f, **csv_reader_params) - elif format == 'tsv': - reader = unicode_csv_reader(f, delimiter='\t', **csv_reader_params) - else: - reader = f - - if format in ['csv', 'tsv'] and isinstance(fields, dict): - if skip_header: - raise ValueError('When using a dict to specify fields with a {} file,' - 'skip_header must be False and' - 'the file must have a header.'.format(format)) - header = next(reader) - field_to_index = {f: header.index(f) for f in fields.keys()} - make_example = partial(make_example, field_to_index=field_to_index) - - if skip_header: - next(reader) - - examples = [make_example(line, fields) for line in reader] - - if isinstance(fields, dict): - fields, field_dict = [], fields - for field in field_dict.values(): - if isinstance(field, list): - fields.extend(field) - else: - fields.append(field) - - super(TabularDataset, self).__init__(examples, fields, **kwargs) - - -def check_split_ratio(split_ratio): - """Check that the split ratio argument is not malformed""" - valid_ratio = 0. - if isinstance(split_ratio, float): - # Only the train set relative ratio is provided - # Assert in bounds, validation size is zero - assert 0. < split_ratio < 1., ( - "Split ratio {} not between 0 and 1".format(split_ratio)) - - test_ratio = 1. 
- split_ratio - return (split_ratio, test_ratio, valid_ratio) - elif isinstance(split_ratio, list): - # A list of relative ratios is provided - length = len(split_ratio) - assert length == 2 or length == 3, ( - "Length of split ratio list should be 2 or 3, got {}".format(split_ratio)) - - # Normalize if necessary - ratio_sum = sum(split_ratio) - if not ratio_sum == 1.: - split_ratio = [float(ratio) / ratio_sum for ratio in split_ratio] - - if length == 2: - return tuple(split_ratio + [valid_ratio]) - return tuple(split_ratio) - else: - raise ValueError('Split ratio must be float or a list, got {}' - .format(type(split_ratio))) - - -def stratify(examples, strata_field): - # The field has to be hashable otherwise this doesn't work - # There's two iterations over the whole dataset here, which can be - # reduced to just one if a dedicated method for stratified splitting is used - unique_strata = set(getattr(example, strata_field) for example in examples) - strata_maps = {s: [] for s in unique_strata} - for example in examples: - strata_maps[getattr(example, strata_field)].append(example) - return list(strata_maps.values()) - - -def rationed_split(examples, train_ratio, test_ratio, val_ratio, rnd): - """Create a random permutation of examples, then split them by ratios - - Arguments: - examples: a list of data - train_ratio, test_ratio, val_ratio: split fractions. - rnd: a random shuffler - - Examples: - >>> examples = [] - >>> train_ratio, test_ratio, val_ratio = 0.7, 0.2, 0.1 - >>> rnd = torchtext.data.dataset.RandomShuffler(None) - >>> train_examples, test_examples, valid_examples = \ - torchtext.data.dataset.rationed_split(examples, train_ratio, - test_ratio, val_ratio, - rnd) - """ - N = len(examples) - randperm = rnd(range(N)) - train_len = int(round(train_ratio * N)) - - # Due to possible rounding problems - if not val_ratio: - test_len = N - train_len - else: - test_len = int(round(test_ratio * N)) - - indices = (randperm[:train_len], # Train - randperm[train_len:train_len + test_len], # Test - randperm[train_len + test_len:]) # Validation - - # There's a possibly empty list for the validation set - data = tuple([examples[i] for i in index] for index in indices) - - return data diff --git a/torchtext/legacy/data/example.py b/torchtext/legacy/data/example.py deleted file mode 100644 index d9f96aeda3..0000000000 --- a/torchtext/legacy/data/example.py +++ /dev/null @@ -1,99 +0,0 @@ -import json -from functools import reduce - - -class Example(object): - """Defines a single training or test example. - - Stores each column of the example as an attribute. 
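# Editor's note: a quick, hedged illustration (not part of the patch) of the
# ratio handling implemented above. check_split_ratio always hands back
# (train, test, valid) fractions: a bare float leaves valid at zero, and a
# relative list that does not sum to 1 is normalized first. Runnable only
# where torchtext.legacy is still installed.
from torchtext.legacy.data.dataset import check_split_ratio

print(check_split_ratio(0.75))       # (0.75, 0.25, 0.0)
print(check_split_ratio([2, 1, 1]))  # (0.5, 0.25, 0.25) -- train, test, valid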
- """ - @classmethod - def fromJSON(cls, data, fields): - ex = cls() - obj = json.loads(data) - - for key, vals in fields.items(): - if vals is not None: - if not isinstance(vals, list): - vals = [vals] - - for val in vals: - # for processing the key likes 'foo.bar' - name, field = val - ks = key.split('.') - - def reducer(obj, key): - if isinstance(obj, list): - results = [] - for data in obj: - if key not in data: - # key error - raise ValueError("Specified key {} was not found in " - "the input data".format(key)) - else: - results.append(data[key]) - return results - else: - # key error - if key not in obj: - raise ValueError("Specified key {} was not found in " - "the input data".format(key)) - else: - return obj[key] - - v = reduce(reducer, ks, obj) - setattr(ex, name, field.preprocess(v)) - return ex - - @classmethod - def fromdict(cls, data, fields): - ex = cls() - for key, vals in fields.items(): - if key not in data: - raise ValueError("Specified key {} was not found in " - "the input data".format(key)) - if vals is not None: - if not isinstance(vals, list): - vals = [vals] - for val in vals: - name, field = val - setattr(ex, name, field.preprocess(data[key])) - return ex - - @classmethod - def fromCSV(cls, data, fields, field_to_index=None): - if field_to_index is None: - return cls.fromlist(data, fields) - else: - assert(isinstance(fields, dict)) - data_dict = {f: data[idx] for f, idx in field_to_index.items()} - return cls.fromdict(data_dict, fields) - - @classmethod - def fromlist(cls, data, fields): - ex = cls() - for (name, field), val in zip(fields, data): - if field is not None: - if isinstance(val, str): - val = val.rstrip('\n') - # Handle field tuples - if isinstance(name, tuple): - for n, f in zip(name, field): - setattr(ex, n, f.preprocess(val)) - else: - setattr(ex, name, field.preprocess(val)) - return ex - - @classmethod - def fromtree(cls, data, fields, subtrees=False): - try: - from nltk.tree import Tree - except ImportError: - print("Please install NLTK. " - "See the docs at http://nltk.org for more information.") - raise - tree = Tree.fromstring(data) - if subtrees: - return [cls.fromlist( - [' '.join(t.leaves()), t.label()], fields) for t in tree.subtrees()] - return cls.fromlist([' '.join(tree.leaves()), tree.label()], fields) diff --git a/torchtext/legacy/data/field.py b/torchtext/legacy/data/field.py deleted file mode 100644 index efbf888666..0000000000 --- a/torchtext/legacy/data/field.py +++ /dev/null @@ -1,735 +0,0 @@ -# coding: utf8 -from collections import Counter, OrderedDict -from itertools import chain -import torch -from tqdm import tqdm -from .dataset import Dataset -from .pipeline import Pipeline -from torchtext.data.utils import get_tokenizer, dtype_to_attr, is_tokenizer_serializable -from torchtext.legacy.vocab import Vocab, SubwordVocab - - -class RawField(object): - """ Defines a general datatype. - - Every dataset consists of one or more types of data. For instance, a text - classification dataset contains sentences and their classes, while a - machine translation dataset contains paired examples of text in two - languages. Each of these types of data is represented by a RawField object. - A RawField object does not assume any property of the data type and - it holds parameters relating to how a datatype should be processed. - - Attributes: - preprocessing: The Pipeline that will be applied to examples - using this field before creating an example. - Default: None. 
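# Editor's note: a hedged sketch (not part of the patch) of the Example
# constructors removed above, runnable only where torchtext.legacy still
# exists. fromdict() keys the raw record by field name and runs each Field's
# preprocess(); the input record here is purely illustrative.
from torchtext.legacy import data

TEXT = data.Field(lower=True)   # default whitespace tokenization
LABEL = data.LabelField()
fields = {'question': ('q', TEXT), 'label': ('label', LABEL)}
ex = data.Example.fromdict(
    {'question': 'Where was Lincoln born?', 'label': '1'}, fields)
print(ex.q)       # ['where', 'was', 'lincoln', 'born?']
print(ex.label)   # '1'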
- postprocessing: A Pipeline that will be applied to a list of examples - using this field before assigning to a batch. - Function signature: (batch(list)) -> object - Default: None. - is_target: Whether this field is a target variable. - Affects iteration over batches. Default: False - """ - - def __init__(self, preprocessing=None, postprocessing=None, is_target=False): - self.preprocessing = preprocessing - self.postprocessing = postprocessing - self.is_target = is_target - - def preprocess(self, x): - """ Preprocess an example if the `preprocessing` Pipeline is provided. """ - if self.preprocessing is not None: - return self.preprocessing(x) - else: - return x - - def process(self, batch, *args, **kwargs): - """ Process a list of examples to create a batch. - - Postprocess the batch with user-provided Pipeline. - - Args: - batch (list(object)): A list of object from a batch of examples. - Returns: - object: Processed object given the input and custom - postprocessing Pipeline. - """ - if self.postprocessing is not None: - batch = self.postprocessing(batch) - return batch - - -class Field(RawField): - """Defines a datatype together with instructions for converting to Tensor. - - Field class models common text processing datatypes that can be represented - by tensors. It holds a Vocab object that defines the set of possible values - for elements of the field and their corresponding numerical representations. - The Field object also holds other parameters relating to how a datatype - should be numericalized, such as a tokenization method and the kind of - Tensor that should be produced. - - If a Field is shared between two columns in a dataset (e.g., question and - answer in a QA dataset), then they will have a shared vocabulary. - - Attributes: - sequential: Whether the datatype represents sequential data. If False, - no tokenization is applied. Default: True. - use_vocab: Whether to use a Vocab object. If False, the data in this - field should already be numerical. Default: True. - init_token: A token that will be prepended to every example using this - field, or None for no initial token. Default: None. - eos_token: A token that will be appended to every example using this - field, or None for no end-of-sentence token. Default: None. - fix_length: A fixed length that all examples using this field will be - padded to, or None for flexible sequence lengths. Default: None. - dtype: The torch.dtype class that represents a batch of examples - of this kind of data. Default: torch.long. - preprocessing: The Pipeline that will be applied to examples - using this field after tokenizing but before numericalizing. Many - Datasets replace this attribute with a custom preprocessor. - Default: None. - postprocessing: A Pipeline that will be applied to examples using - this field after numericalizing but before the numbers are turned - into a Tensor. The pipeline function takes the batch as a list, and - the field's Vocab. - Default: None. - lower: Whether to lowercase the text in this field. Default: False. - tokenize: The function used to tokenize strings using this field into - sequential examples. If "spacy", the SpaCy tokenizer is - used. If a non-serializable function is passed as an argument, - the field will not be able to be serialized. Default: string.split. - tokenizer_language: The language of the tokenizer to be constructed. - Various languages currently supported only in SpaCy. 
- include_lengths: Whether to return a tuple of a padded minibatch and - a list containing the lengths of each examples, or just a padded - minibatch. Default: False. - batch_first: Whether to produce tensors with the batch dimension first. - Default: False. - pad_token: The string token used as padding. Default: "". - unk_token: The string token used to represent OOV words. Default: "". - pad_first: Do the padding of the sequence at the beginning. Default: False. - truncate_first: Do the truncating of the sequence at the beginning. Default: False - stop_words: Tokens to discard during the preprocessing step. Default: None - is_target: Whether this field is a target variable. - Affects iteration over batches. Default: False - """ - - vocab_cls = Vocab - # Dictionary mapping PyTorch tensor dtypes to the appropriate Python - # numeric type. - dtypes = { - torch.float32: float, - torch.float: float, - torch.float64: float, - torch.double: float, - torch.float16: float, - torch.half: float, - - torch.uint8: int, - torch.int8: int, - torch.int16: int, - torch.short: int, - torch.int32: int, - torch.int: int, - torch.int64: int, - torch.long: int, - } - - ignore = ['dtype', 'tokenize'] - - def __init__(self, sequential=True, use_vocab=True, init_token=None, - eos_token=None, fix_length=None, dtype=torch.long, - preprocessing=None, postprocessing=None, lower=False, - tokenize=None, tokenizer_language='en', include_lengths=False, - batch_first=False, pad_token="", unk_token="", - pad_first=False, truncate_first=False, stop_words=None, - is_target=False): - self.sequential = sequential - self.use_vocab = use_vocab - self.init_token = init_token - self.eos_token = eos_token - self.unk_token = unk_token - self.fix_length = fix_length - self.dtype = dtype - self.preprocessing = preprocessing - self.postprocessing = postprocessing - self.lower = lower - # store params to construct tokenizer for serialization - # in case the tokenizer isn't picklable (e.g. spacy) - self.tokenizer_args = (tokenize, tokenizer_language) - self.tokenize = get_tokenizer(tokenize, tokenizer_language) - self.include_lengths = include_lengths - self.batch_first = batch_first - self.pad_token = pad_token if self.sequential else None - self.pad_first = pad_first - self.truncate_first = truncate_first - try: - self.stop_words = set(stop_words) if stop_words is not None else None - except TypeError: - raise ValueError("Stop words must be convertible to a set") - self.is_target = is_target - - def __getstate__(self): - str_type = dtype_to_attr(self.dtype) - if is_tokenizer_serializable(*self.tokenizer_args): - tokenize = self.tokenize - else: - # signal to restore in `__setstate__` - tokenize = None - attrs = {k: v for k, v in self.__dict__.items() if k not in self.ignore} - attrs['dtype'] = str_type - attrs['tokenize'] = tokenize - - return attrs - - def __setstate__(self, state): - state['dtype'] = getattr(torch, state['dtype']) - if not state['tokenize']: - state['tokenize'] = get_tokenizer(*state['tokenizer_args']) - self.__dict__.update(state) - - def __hash__(self): - # we don't expect this to be called often - return 42 - - def __eq__(self, other): - if not isinstance(other, RawField): - return False - - return self.__dict__ == other.__dict__ - - def preprocess(self, x): - """Load a single example using this field, tokenizing if necessary. - - If `sequential=True`, the input will be tokenized. 
Then the input - will be optionally lowercased and passed to the user-provided - `preprocessing` Pipeline.""" - if self.sequential and isinstance(x, str): - x = self.tokenize(x.rstrip('\n')) - if self.lower: - x = Pipeline(str.lower)(x) - if self.sequential and self.use_vocab and self.stop_words is not None: - x = [w for w in x if w not in self.stop_words] - if self.preprocessing is not None: - return self.preprocessing(x) - else: - return x - - def process(self, batch, device=None): - """ Process a list of examples to create a torch.Tensor. - - Pad, numericalize, and postprocess a batch and create a tensor. - - Args: - batch (list(object)): A list of object from a batch of examples. - Returns: - torch.autograd.Variable: Processed object given the input - and custom postprocessing Pipeline. - """ - padded = self.pad(batch) - tensor = self.numericalize(padded, device=device) - return tensor - - def pad(self, minibatch): - """Pad a batch of examples using this field. - - Pads to self.fix_length if provided, otherwise pads to the length of - the longest example in the batch. Prepends self.init_token and appends - self.eos_token if those attributes are not None. Returns a tuple of the - padded list and a list containing lengths of each example if - `self.include_lengths` is `True` and `self.sequential` is `True`, else just - returns the padded list. If `self.sequential` is `False`, no padding is applied. - """ - minibatch = list(minibatch) - if not self.sequential: - return minibatch - if self.fix_length is None: - max_len = max(len(x) for x in minibatch) - else: - max_len = self.fix_length + ( - self.init_token, self.eos_token).count(None) - 2 - padded, lengths = [], [] - for x in minibatch: - if self.pad_first: - padded.append( - [self.pad_token] * max(0, max_len - len(x)) - + ([] if self.init_token is None else [self.init_token]) - + list(x[-max_len:] if self.truncate_first else x[:max_len]) - + ([] if self.eos_token is None else [self.eos_token])) - else: - padded.append( - ([] if self.init_token is None else [self.init_token]) - + list(x[-max_len:] if self.truncate_first else x[:max_len]) - + ([] if self.eos_token is None else [self.eos_token]) - + [self.pad_token] * max(0, max_len - len(x))) - lengths.append(len(padded[-1]) - max(0, max_len - len(x))) - if self.include_lengths: - return (padded, lengths) - return padded - - def build_vocab(self, *args, **kwargs): - """Construct the Vocab object for this field from one or more datasets. - - Arguments: - Positional arguments: Dataset objects or other iterable data - sources from which to construct the Vocab object that - represents the set of possible values for this field. If - a Dataset object is provided, all columns corresponding - to this field are used; individual columns can also be - provided directly. - Remaining keyword arguments: Passed to the constructor of Vocab. 
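# Editor's note: a small, hedged illustration (not part of the patch) of the
# Field.pad() behaviour documented above; it assumes a torchtext build that
# still ships torchtext.legacy. With include_lengths=True the call returns the
# padded batch plus each example's length including the init/eos tokens.
from torchtext.legacy import data

TEXT = data.Field(init_token='<s>', eos_token='</s>', pad_token='<pad>',
                  include_lengths=True)
padded, lengths = TEXT.pad([['hello', 'world'], ['hi']])
# padded  == [['<s>', 'hello', 'world', '</s>'],
#             ['<s>', 'hi', '</s>', '<pad>']]
# lengths == [4, 3]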
- """ - counter = Counter() - sources = [] - for arg in args: - if isinstance(arg, Dataset): - sources += [getattr(arg, name) for name, field in - arg.fields.items() if field is self] - else: - sources.append(arg) - for data in sources: - for x in data: - if not self.sequential: - x = [x] - try: - counter.update(x) - except TypeError: - counter.update(chain.from_iterable(x)) - specials = list(OrderedDict.fromkeys( - tok for tok in [self.unk_token, self.pad_token, self.init_token, - self.eos_token] + kwargs.pop('specials', []) - if tok is not None)) - self.vocab = self.vocab_cls(counter, specials=specials, **kwargs) - - def numericalize(self, arr, device=None): - """Turn a batch of examples that use this field into a Variable. - - If the field has include_lengths=True, a tensor of lengths will be - included in the return value. - - Arguments: - arr (List[List[str]], or tuple of (List[List[str]], List[int])): List of tokenized - and padded examples, or tuple of List of - tokenized and padded examples and List of lengths of each - example if self.include_lengths is True. - device (str or torch.device): A string or instance of `torch.device` - specifying which device the Variables are going to be created on. - If left as default, the tensors will be created on cpu. Default: None. - """ - if self.include_lengths and not isinstance(arr, tuple): - raise ValueError("Field has include_lengths set to True, but " - "input data is not a tuple of " - "(data batch, batch lengths).") - if isinstance(arr, tuple): - arr, lengths = arr - lengths = torch.tensor(lengths, dtype=self.dtype, device=device) - - if self.use_vocab: - if self.sequential: - arr = [[self.vocab.stoi[x] for x in ex] for ex in arr] - else: - arr = [self.vocab.stoi[x] for x in arr] - - if self.postprocessing is not None: - arr = self.postprocessing(arr, self.vocab) - else: - if self.dtype not in self.dtypes: - raise ValueError( - "Specified Field dtype {} can not be used with " - "use_vocab=False because we do not know how to numericalize it. " - "Please raise an issue at " - "https://github.com/pytorch/text/issues".format(self.dtype)) - numericalization_func = self.dtypes[self.dtype] - # It doesn't make sense to explicitly coerce to a numeric type if - # the data is sequential, since it's unclear how to coerce padding tokens - # to a numeric type. 
- if not self.sequential: - arr = [numericalization_func(x) if isinstance(x, str) - else x for x in arr] - if self.postprocessing is not None: - arr = self.postprocessing(arr, None) - - var = torch.tensor(arr, dtype=self.dtype, device=device) - - if self.sequential and not self.batch_first: - var.t_() - if self.sequential: - var = var.contiguous() - - if self.include_lengths: - return var, lengths - return var - - -class ReversibleField(Field): - def __init__(self, **kwargs): - if kwargs.get('tokenize') is list: - self.use_revtok = False - else: - self.use_revtok = True - if kwargs.get('tokenize') is None: - kwargs['tokenize'] = 'revtok' - if 'unk_token' not in kwargs: - kwargs['unk_token'] = ' UNK ' - super(ReversibleField, self).__init__(**kwargs) - - def reverse(self, batch): - if self.use_revtok: - try: - import revtok - except ImportError: - print("Please install revtok.") - raise - if not self.batch_first: - batch = batch.t() - with torch.cuda.device_of(batch): - batch = batch.tolist() - batch = [[self.vocab.itos[ind] for ind in ex] for ex in batch] # denumericalize - - def trim(s, t): - sentence = [] - for w in s: - if w == t: - break - sentence.append(w) - return sentence - - batch = [trim(ex, self.eos_token) for ex in batch] # trim past frst eos - - def filter_special(tok): - return tok not in (self.init_token, self.pad_token) - - batch = [filter(filter_special, ex) for ex in batch] - if self.use_revtok: - return [revtok.detokenize(ex) for ex in batch] - return [''.join(ex) for ex in batch] - - -class SubwordField(ReversibleField): - vocab_cls = SubwordVocab - - def __init__(self, **kwargs): - kwargs['tokenize'] = 'subword' - if 'unk_token' not in kwargs: - kwargs['unk_token'] = '�' - super(SubwordField, self).__init__(**kwargs) - - def segment(self, *args): - """Segment one or more datasets with this subword field. - - Arguments: - Positional arguments: Dataset objects or other indexable - mutable sequences to segment. If a Dataset object is provided, - all columns corresponding to this field are used; individual - columns can also be provided directly. - """ - sources = [] - for arg in args: - if isinstance(arg, Dataset): - sources += [getattr(arg, name) for name, field in - arg.fields.items() if field is self] - else: - sources.append(arg) - for data in sources: - for x in tqdm(data, 'segmenting'): - x[:] = self.vocab.segment(x) - - -class NestedField(Field): - """A nested field. - - A nested field holds another field (called *nesting field*), accepts an untokenized - string or a list string tokens and groups and treats them as one field as described - by the nesting field. Every token will be preprocessed, padded, etc. in the manner - specified by the nesting field. Note that this means a nested field always has - ``sequential=True``. The two fields' vocabularies will be shared. Their - numericalization results will be stacked into a single tensor. And NestedField will - share the same include_lengths with nesting_field, so one shouldn't specify the - include_lengths in the nesting_field. This field is - primarily used to implement character embeddings. See ``tests/data/test_field.py`` - for examples on how to use this field. - - Arguments: - nesting_field (Field): A field contained in this nested field. - use_vocab (bool): Whether to use a Vocab object. If False, the data in this - field should already be numerical. Default: ``True``. - init_token (str): A token that will be prepended to every example using this - field, or None for no initial token. Default: ``None``. 
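# Editor's note: a hedged end-to-end sketch (not part of the patch) of the
# Field pipeline shown above -- preprocess, build_vocab, then process(), which
# pads and numericalizes into a single LongTensor. Runnable only where
# torchtext.legacy is still available; the toy sentences are illustrative.
from torchtext.legacy import data

TEXT = data.Field(batch_first=True)
examples = [TEXT.preprocess('the cat sat'), TEXT.preprocess('the dog')]
TEXT.build_vocab(examples)        # counts tokens and adds the special tokens
batch = TEXT.process(examples)    # pad to length 3, then look up indices
print(batch.shape)                # torch.Size([2, 3])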
- eos_token (str): A token that will be appended to every example using this - field, or None for no end-of-sentence token. Default: ``None``. - fix_length (int): A fixed length that all examples using this field will be - padded to, or ``None`` for flexible sequence lengths. Default: ``None``. - dtype: The torch.dtype class that represents a batch of examples - of this kind of data. Default: ``torch.long``. - preprocessing (Pipeline): The Pipeline that will be applied to examples - using this field after tokenizing but before numericalizing. Many - Datasets replace this attribute with a custom preprocessor. - Default: ``None``. - postprocessing (Pipeline): A Pipeline that will be applied to examples using - this field after numericalizing but before the numbers are turned - into a Tensor. The pipeline function takes the batch as a list, and - the field's Vocab. Default: ``None``. - include_lengths: Whether to return a tuple of a padded minibatch and - a list containing the lengths of each examples, or just a padded - minibatch. Default: False. - tokenize: The function used to tokenize strings using this field into - sequential examples. If "spacy", the SpaCy tokenizer is - used. If a non-serializable function is passed as an argument, - the field will not be able to be serialized. Default: string.split. - tokenizer_language: The language of the tokenizer to be constructed. - Various languages currently supported only in SpaCy. - pad_token (str): The string token used as padding. If ``nesting_field`` is - sequential, this will be set to its ``pad_token``. Default: ``""``. - pad_first (bool): Do the padding of the sequence at the beginning. Default: - ``False``. - """ - - def __init__(self, nesting_field, use_vocab=True, init_token=None, eos_token=None, - fix_length=None, dtype=torch.long, preprocessing=None, - postprocessing=None, tokenize=None, tokenizer_language='en', - include_lengths=False, pad_token='', - pad_first=False, truncate_first=False): - if isinstance(nesting_field, NestedField): - raise ValueError('nesting field must not be another NestedField') - if nesting_field.include_lengths: - raise ValueError('nesting field cannot have include_lengths=True') - - if nesting_field.sequential: - pad_token = nesting_field.pad_token - super(NestedField, self).__init__( - use_vocab=use_vocab, - init_token=init_token, - eos_token=eos_token, - fix_length=fix_length, - dtype=dtype, - preprocessing=preprocessing, - postprocessing=postprocessing, - lower=nesting_field.lower, - tokenize=tokenize, - tokenizer_language=tokenizer_language, - batch_first=True, - pad_token=pad_token, - unk_token=nesting_field.unk_token, - pad_first=pad_first, - truncate_first=truncate_first, - include_lengths=include_lengths - ) - self.nesting_field = nesting_field - # in case the user forget to do that - self.nesting_field.batch_first = True - - def preprocess(self, xs): - """Preprocess a single example. - - Firstly, tokenization and the supplied preprocessing pipeline is applied. Since - this field is always sequential, the result is a list. Then, each element of - the list is preprocessed using ``self.nesting_field.preprocess`` and the resulting - list is returned. - - Arguments: - xs (list or str): The input to preprocess. - - Returns: - list: The preprocessed list. - """ - return [self.nesting_field.preprocess(x) - for x in super(NestedField, self).preprocess(xs)] - - def pad(self, minibatch): - """Pad a batch of examples using this field. 
- - If ``self.nesting_field.sequential`` is ``False``, each example in the batch must - be a list of string tokens, and pads them as if by a ``Field`` with - ``sequential=True``. Otherwise, each example must be a list of list of tokens. - Using ``self.nesting_field``, pads the list of tokens to - ``self.nesting_field.fix_length`` if provided, or otherwise to the length of the - longest list of tokens in the batch. Next, using this field, pads the result by - filling short examples with ``self.nesting_field.pad_token``. - - Example: - >>> import pprint - >>> pp = pprint.PrettyPrinter(indent=4) - >>> - >>> nesting_field = Field(pad_token='', init_token='', eos_token='') - >>> field = NestedField(nesting_field, init_token='', eos_token='') - >>> minibatch = [ - ... [list('john'), list('loves'), list('mary')], - ... [list('mary'), list('cries')], - ... ] - >>> padded = field.pad(minibatch) - >>> pp.pprint(padded) - [ [ ['', '', '', '', '', '', ''], - ['', 'j', 'o', 'h', 'n', '', ''], - ['', 'l', 'o', 'v', 'e', 's', ''], - ['', 'm', 'a', 'r', 'y', '', ''], - ['', '', '', '', '', '', '']], - [ ['', '', '', '', '', '', ''], - ['', 'm', 'a', 'r', 'y', '', ''], - ['', 'c', 'r', 'i', 'e', 's', ''], - ['', '', '', '', '', '', ''], - ['', '', '', '', '', '', '']]] - - Arguments: - minibatch (list): Each element is a list of string if - ``self.nesting_field.sequential`` is ``False``, a list of list of string - otherwise. - - Returns: - list: The padded minibatch. or (padded, sentence_lens, word_lengths) - """ - minibatch = list(minibatch) - if not self.nesting_field.sequential: - return super(NestedField, self).pad(minibatch) - - # Save values of attributes to be monkeypatched - old_pad_token = self.pad_token - old_init_token = self.init_token - old_eos_token = self.eos_token - old_fix_len = self.nesting_field.fix_length - # Monkeypatch the attributes - if self.nesting_field.fix_length is None: - max_len = max(len(xs) for ex in minibatch for xs in ex) - fix_len = max_len + 2 - (self.nesting_field.init_token, - self.nesting_field.eos_token).count(None) - self.nesting_field.fix_length = fix_len - self.pad_token = [self.pad_token] * self.nesting_field.fix_length - if self.init_token is not None: - # self.init_token = self.nesting_field.pad([[self.init_token]])[0] - self.init_token = [self.init_token] - if self.eos_token is not None: - # self.eos_token = self.nesting_field.pad([[self.eos_token]])[0] - self.eos_token = [self.eos_token] - # Do padding - old_include_lengths = self.include_lengths - self.include_lengths = True - self.nesting_field.include_lengths = True - padded, sentence_lengths = super(NestedField, self).pad(minibatch) - padded_with_lengths = [self.nesting_field.pad(ex) for ex in padded] - word_lengths = [] - final_padded = [] - max_sen_len = len(padded[0]) - for (pad, lens), sentence_len in zip(padded_with_lengths, sentence_lengths): - if sentence_len == max_sen_len: - lens = lens - pad = pad - elif self.pad_first: - lens[:(max_sen_len - sentence_len)] = ( - [0] * (max_sen_len - sentence_len)) - pad[:(max_sen_len - sentence_len)] = ( - [self.pad_token] * (max_sen_len - sentence_len)) - else: - lens[-(max_sen_len - sentence_len):] = ( - [0] * (max_sen_len - sentence_len)) - pad[-(max_sen_len - sentence_len):] = ( - [self.pad_token] * (max_sen_len - sentence_len)) - word_lengths.append(lens) - final_padded.append(pad) - padded = final_padded - - # Restore monkeypatched attributes - self.nesting_field.fix_length = old_fix_len - self.pad_token = old_pad_token - self.init_token = old_init_token 
- self.eos_token = old_eos_token - self.include_lengths = old_include_lengths - if self.include_lengths: - return padded, sentence_lengths, word_lengths - return padded - - def build_vocab(self, *args, **kwargs): - """Construct the Vocab object for nesting field and combine it with this field's vocab. - - Arguments: - Positional arguments: Dataset objects or other iterable data - sources from which to construct the Vocab object that - represents the set of possible values for the nesting field. If - a Dataset object is provided, all columns corresponding - to this field are used; individual columns can also be - provided directly. - Remaining keyword arguments: Passed to the constructor of Vocab. - """ - sources = [] - for arg in args: - if isinstance(arg, Dataset): - sources.extend( - [getattr(arg, name) for name, field in arg.fields.items() - if field is self] - ) - else: - sources.append(arg) - - flattened = [] - for source in sources: - flattened.extend(source) - old_vectors = None - old_unk_init = None - old_vectors_cache = None - if "vectors" in kwargs.keys(): - old_vectors = kwargs["vectors"] - kwargs["vectors"] = None - if "unk_init" in kwargs.keys(): - old_unk_init = kwargs["unk_init"] - kwargs["unk_init"] = None - if "vectors_cache" in kwargs.keys(): - old_vectors_cache = kwargs["vectors_cache"] - kwargs["vectors_cache"] = None - # just build vocab and does not load vector - self.nesting_field.build_vocab(*flattened, **kwargs) - super(NestedField, self).build_vocab() - self.vocab.extend(self.nesting_field.vocab) - self.vocab.freqs = self.nesting_field.vocab.freqs.copy() - if old_vectors is not None: - self.vocab.load_vectors(old_vectors, - unk_init=old_unk_init, cache=old_vectors_cache) - - self.nesting_field.vocab = self.vocab - - def numericalize(self, arrs, device=None): - """Convert a padded minibatch into a variable tensor. - - Each item in the minibatch will be numericalized independently and the resulting - tensors will be stacked at the first dimension. - - Arguments: - arrs (List[List[str]]): List of tokenized and padded examples. - device (str or torch.device): A string or instance of `torch.device` - specifying which device the Variables are going to be created on. - If left as default, the tensors will be created on cpu. Default: None. - """ - numericalized = [] - self.nesting_field.include_lengths = False - if self.include_lengths: - arrs, sentence_lengths, word_lengths = arrs - - for arr in arrs: - numericalized_ex = self.nesting_field.numericalize( - arr, device=device) - numericalized.append(numericalized_ex) - padded_batch = torch.stack(numericalized) - - self.nesting_field.include_lengths = True - if self.include_lengths: - sentence_lengths = \ - torch.tensor(sentence_lengths, dtype=self.dtype, device=device) - word_lengths = torch.tensor(word_lengths, dtype=self.dtype, device=device) - return (padded_batch, sentence_lengths, word_lengths) - return padded_batch - - -class LabelField(Field): - """A Label field. - - A label field is a shallow wrapper around a standard field designed to hold labels - for a classification task. Its only use is to set the unk_token and sequential to - `None` by default. 
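# Editor's note: a tiny, hedged illustration (not part of the patch) of
# LabelField as described above: sequential=False, no unk token, and
# is_target=True, so the vocab holds only the labels actually observed.
# Runnable only where torchtext.legacy still exists.
from torchtext.legacy import data

LABEL = data.LabelField()
LABEL.build_vocab(['pos', 'neg', 'pos'])
print(dict(LABEL.vocab.stoi))   # {'pos': 0, 'neg': 1} -- most frequent label first
print(LABEL.is_target)          # True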
- """ - - def __init__(self, **kwargs): - # whichever value is set for sequential, unk_token, and is_target - # will be overwritten - kwargs['sequential'] = False - kwargs['unk_token'] = None - kwargs['is_target'] = True - - super(LabelField, self).__init__(**kwargs) diff --git a/torchtext/legacy/data/iterator.py b/torchtext/legacy/data/iterator.py deleted file mode 100644 index b0bb4437b5..0000000000 --- a/torchtext/legacy/data/iterator.py +++ /dev/null @@ -1,297 +0,0 @@ -import math -import random - -import logging -import torch -from torchtext.data.utils import RandomShuffler -from .batch import Batch -from .dataset import Dataset - -logger = logging.getLogger(__name__) - - -class Iterator(object): - """Defines an iterator that loads batches of data from a Dataset. - - Attributes: - dataset: The Dataset object to load Examples from. - batch_size: Batch size. - batch_size_fn: Function of three arguments (new example to add, current - count of examples in the batch, and current effective batch size) - that returns the new effective batch size resulting from adding - that example to a batch. This is useful for dynamic batching, where - this function would add to the current effective batch size the - number of tokens in the new example. - sort_key: A key to use for sorting examples in order to batch together - examples with similar lengths and minimize padding. The sort_key - provided to the Iterator constructor overrides the sort_key - attribute of the Dataset, or defers to it if None. - train: Whether the iterator represents a train set. - repeat: Whether to repeat the iterator for multiple epochs. Default: False. - shuffle: Whether to shuffle examples between epochs. - sort: Whether to sort examples according to self.sort_key. - Note that shuffle and sort default to train and (not train). - sort_within_batch: Whether to sort (in descending order according to - self.sort_key) within each batch. If None, defaults to self.sort. - If self.sort is True and this is False, the batch is left in the - original (ascending) sorted order. - device (str or `torch.device`): A string or instance of `torch.device` - specifying which device the Variables are going to be created on. - If left as default, the tensors will be created on cpu. Default: None. - """ - - def __init__(self, dataset, batch_size, sort_key=None, device=None, - batch_size_fn=None, train=True, - repeat=False, shuffle=None, sort=None, - sort_within_batch=None): - self.batch_size, self.train, self.dataset = batch_size, train, dataset - self.batch_size_fn = batch_size_fn - self.iterations = 0 - self.repeat = repeat - self.shuffle = train if shuffle is None else shuffle - self.sort = not train if sort is None else sort - - if sort_within_batch is None: - self.sort_within_batch = self.sort - else: - self.sort_within_batch = sort_within_batch - if sort_key is None: - self.sort_key = dataset.sort_key - else: - self.sort_key = sort_key - - if isinstance(device, int): - logger.warning("The `device` argument should be set by using `torch.device`" - + " or passing a string as an argument. 
This behavior will be" - + " deprecated soon and currently defaults to cpu.") - device = None - - if device is None: - device = torch.device('cpu') - elif isinstance(device, str): - device = torch.device(device) - - self.device = device - self.random_shuffler = RandomShuffler() - - # For state loading/saving only - self._iterations_this_epoch = 0 - self._random_state_this_epoch = None - self._restored_from_state = False - - @classmethod - def splits(cls, datasets, batch_sizes=None, **kwargs): - """Create Iterator objects for multiple splits of a dataset. - - Arguments: - datasets: Tuple of Dataset objects corresponding to the splits. The - first such object should be the train set. - batch_sizes: Tuple of batch sizes to use for the different splits, - or None to use the same batch_size for all splits. - Remaining keyword arguments: Passed to the constructor of the - iterator class being used. - """ - if batch_sizes is None: - batch_sizes = [kwargs.pop('batch_size')] * len(datasets) - ret = [] - for i in range(len(datasets)): - train = i == 0 - ret.append(cls( - datasets[i], batch_size=batch_sizes[i], train=train, **kwargs)) - return tuple(ret) - - def data(self): - """Return the examples in the dataset in order, sorted, or shuffled.""" - if self.sort: - xs = sorted(self.dataset, key=self.sort_key) - elif self.shuffle: - xs = [self.dataset[i] for i in self.random_shuffler(range(len(self.dataset)))] - else: - xs = self.dataset - return xs - - def init_epoch(self): - """Set up the batch generator for a new epoch.""" - - if self._restored_from_state: - self.random_shuffler.random_state = self._random_state_this_epoch - else: - self._random_state_this_epoch = self.random_shuffler.random_state - - self.create_batches() - - if self._restored_from_state: - self._restored_from_state = False - else: - self._iterations_this_epoch = 0 - - if not self.repeat: - self.iterations = 0 - - def create_batches(self): - self.batches = batch(self.data(), self.batch_size, self.batch_size_fn) - - @property - def epoch(self): - return math.floor(self.iterations / len(self)) - - def __len__(self): - if self.batch_size_fn is not None: - raise NotImplementedError - return math.ceil(len(self.dataset) / self.batch_size) - - def __iter__(self): - while True: - self.init_epoch() - for idx, minibatch in enumerate(self.batches): - # fast-forward if loaded from state - if self._iterations_this_epoch > idx: - continue - self.iterations += 1 - self._iterations_this_epoch += 1 - if self.sort_within_batch: - # NOTE: `rnn.pack_padded_sequence` requires that a minibatch - # be sorted by decreasing order, which requires reversing - # relative to typical sort keys - if self.sort: - minibatch.reverse() - else: - minibatch.sort(key=self.sort_key, reverse=True) - yield Batch(minibatch, self.dataset, self.device) - if not self.repeat: - return - - def state_dict(self): - return { - "iterations": self.iterations, - "iterations_this_epoch": self._iterations_this_epoch, - "random_state_this_epoch": self._random_state_this_epoch} - - def load_state_dict(self, state_dict): - self.iterations = state_dict["iterations"] - self._iterations_this_epoch = state_dict["iterations_this_epoch"] - self._random_state_this_epoch = state_dict["random_state_this_epoch"] - self._restored_from_state = True - - -class BPTTIterator(Iterator): - """Defines an iterator for language modeling tasks that use BPTT. 
- - Provides contiguous streams of examples together with targets that are - one timestep further forward, for language modeling training with - backpropagation through time (BPTT). Expects a Dataset with a single - example and a single field called 'text' and produces Batches with text and - target attributes. - - Attributes: - dataset: The Dataset object to load Examples from. - batch_size: Batch size. - bptt_len: Length of sequences for backpropagation through time. - sort_key: A key to use for sorting examples in order to batch together - examples with similar lengths and minimize padding. The sort_key - provided to the Iterator constructor overrides the sort_key - attribute of the Dataset, or defers to it if None. - train: Whether the iterator represents a train set. - repeat: Whether to repeat the iterator for multiple epochs. Default: False. - shuffle: Whether to shuffle examples between epochs. - sort: Whether to sort examples according to self.sort_key. - Note that shuffle and sort default to train and (not train). - device (str or torch.device): A string or instance of `torch.device` - specifying which device the Variables are going to be created on. - If left as default, the tensors will be created on cpu. Default: None. - """ - - def __init__(self, dataset, batch_size, bptt_len, **kwargs): - self.bptt_len = bptt_len - super(BPTTIterator, self).__init__(dataset, batch_size, **kwargs) - - def __len__(self): - return math.ceil((len(self.dataset[0].text) / self.batch_size - 1) - / self.bptt_len) - - def __iter__(self): - text = self.dataset[0].text - TEXT = self.dataset.fields['text'] - TEXT.eos_token = None - text = text + ([TEXT.pad_token] * int(math.ceil(len(text) / self.batch_size) - * self.batch_size - len(text))) - data = TEXT.numericalize( - [text], device=self.device) - data = data.view(self.batch_size, -1).t().contiguous() - dataset = Dataset(examples=self.dataset.examples, fields=[ - ('text', TEXT), ('target', TEXT)]) - while True: - for i in range(0, len(self) * self.bptt_len, self.bptt_len): - self.iterations += 1 - seq_len = min(self.bptt_len, len(data) - i - 1) - batch_text = data[i:i + seq_len] - batch_target = data[i + 1:i + 1 + seq_len] - if TEXT.batch_first: - batch_text = batch_text.t().contiguous() - batch_target = batch_target.t().contiguous() - yield Batch.fromvars( - dataset, self.batch_size, - text=batch_text, - target=batch_target) - if not self.repeat: - return - - -class BucketIterator(Iterator): - """Defines an iterator that batches examples of similar lengths together. - - Minimizes amount of padding needed while producing freshly shuffled - batches for each new epoch. See pool for the bucketing procedure used. 
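        A rough usage sketch, assuming ``train`` is a legacy ``Dataset`` whose
        examples expose a ``text`` attribute (both names are placeholders):

            >>> train_iter = BucketIterator(train, batch_size=32,
            ...                             sort_key=lambda ex: len(ex.text),
            ...                             sort_within_batch=True)
            >>> for batch in train_iter:
            ...     pass  # examples of similar length end up in the same batch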
- """ - - def create_batches(self): - if self.sort: - self.batches = batch(self.data(), self.batch_size, - self.batch_size_fn) - else: - self.batches = pool(self.data(), self.batch_size, - self.sort_key, self.batch_size_fn, - random_shuffler=self.random_shuffler, - shuffle=self.shuffle, - sort_within_batch=self.sort_within_batch) - - -def batch(data, batch_size, batch_size_fn=None): - """Yield elements from data in chunks of batch_size.""" - if batch_size_fn is None: - def batch_size_fn(new, count, sofar): - return count - minibatch, size_so_far = [], 0 - for ex in data: - minibatch.append(ex) - size_so_far = batch_size_fn(ex, len(minibatch), size_so_far) - if size_so_far == batch_size: - yield minibatch - minibatch, size_so_far = [], 0 - elif size_so_far > batch_size: - yield minibatch[:-1] - minibatch, size_so_far = minibatch[-1:], batch_size_fn(ex, 1, 0) - if minibatch: - yield minibatch - - -def pool(data, batch_size, key, batch_size_fn=lambda new, count, sofar: count, - random_shuffler=None, shuffle=False, sort_within_batch=False): - """Sort within buckets, then batch, then shuffle batches. - - Partitions data into chunks of size 100*batch_size, sorts examples within - each chunk using sort_key, then batch these examples and shuffle the - batches. - """ - if random_shuffler is None: - random_shuffler = random.shuffle - for p in batch(data, batch_size * 100, batch_size_fn): - p_batch = batch(sorted(p, key=key), batch_size, batch_size_fn) \ - if sort_within_batch \ - else batch(p, batch_size, batch_size_fn) - if shuffle: - for b in random_shuffler(list(p_batch)): - yield b - else: - for b in list(p_batch): - yield b diff --git a/torchtext/legacy/data/pipeline.py b/torchtext/legacy/data/pipeline.py deleted file mode 100644 index f576fdc720..0000000000 --- a/torchtext/legacy/data/pipeline.py +++ /dev/null @@ -1,85 +0,0 @@ -class Pipeline(object): - """Defines a pipeline for transforming sequence data. - - The input is assumed to be utf-8 encoded `str`. - - Attributes: - convert_token: The function to apply to input sequence data. - pipes: The Pipelines that will be applied to input sequence - data in order. - """ - - def __init__(self, convert_token=None): - """Create a pipeline. - - Arguments: - convert_token: The function to apply to input sequence data. - If None, the identity function is used. Default: None - """ - if convert_token is None: - self.convert_token = Pipeline.identity - elif callable(convert_token): - self.convert_token = convert_token - else: - raise ValueError("Pipeline input convert_token {} is not None " - "or callable".format(convert_token)) - self.pipes = [self] - - def __call__(self, x, *args): - """Apply the the current Pipeline(s) to an input. - - Arguments: - x: The input to process with the Pipeline(s). - Positional arguments: Forwarded to the `call` function - of the Pipeline(s). - """ - for pipe in self.pipes: - x = pipe.call(x, *args) - return x - - def call(self, x, *args): - """Apply _only_ the convert_token function of the current pipeline - to the input. If the input is a list, a list with the results of - applying the `convert_token` function to all input elements is - returned. - - Arguments: - x: The input to apply the convert_token function to. - Positional arguments: Forwarded to the `convert_token` function - of the current Pipeline. 
- """ - if isinstance(x, list): - return [self.convert_token(tok, *args) for tok in x] - return self.convert_token(x, *args) - - def add_before(self, pipeline): - """Add a Pipeline to be applied before this processing pipeline. - - Arguments: - pipeline: The Pipeline or callable to apply before this - Pipeline. - """ - if not isinstance(pipeline, Pipeline): - pipeline = Pipeline(pipeline) - self.pipes = pipeline.pipes[:] + self.pipes[:] - return self - - def add_after(self, pipeline): - """Add a Pipeline to be applied after this processing pipeline. - - Arguments: - pipeline: The Pipeline or callable to apply after this - Pipeline. - """ - if not isinstance(pipeline, Pipeline): - pipeline = Pipeline(pipeline) - self.pipes = self.pipes[:] + pipeline.pipes[:] - return self - - @staticmethod - def identity(x): - """Return a copy of the input. - - This is here for serialization compatibility with pickle. - """ - return x diff --git a/torchtext/legacy/datasets/__init__.py b/torchtext/legacy/datasets/__init__.py deleted file mode 100644 index 5eced837a5..0000000000 --- a/torchtext/legacy/datasets/__init__.py +++ /dev/null @@ -1,42 +0,0 @@ -from .language_modeling import LanguageModelingDataset, WikiText2, WikiText103, PennTreebank # NOQA -from .nli import SNLI, MultiNLI, XNLI -from .sst import SST -from .translation import TranslationDataset, Multi30k, IWSLT, WMT14 # NOQA -from .sequence_tagging import SequenceTaggingDataset, UDPOS, CoNLL2000Chunking # NOQA -from .trec import TREC -from .imdb import IMDB -from .babi import BABI20 -from .text_classification import TextClassificationDataset, \ - AG_NEWS, SogouNews, DBpedia, YelpReviewPolarity, \ - YelpReviewFull, YahooAnswers, \ - AmazonReviewPolarity, AmazonReviewFull -from .unsupervised_learning import EnWik9 - -__all__ = ['LanguageModelingDataset', - 'SNLI', - 'MultiNLI', - 'XNLI', - 'SST', - 'TranslationDataset', - 'Multi30k', - 'IWSLT', - 'WMT14', - 'WikiText2', - 'WikiText103', - 'PennTreebank', - 'TREC', - 'IMDB', - 'SequenceTaggingDataset', - 'UDPOS', - 'CoNLL2000Chunking', - 'BABI20', - 'TextClassificationDataset', - 'AG_NEWS', - 'SogouNews', - 'DBpedia', - 'YelpReviewPolarity', - 'YelpReviewFull', - 'YahooAnswers', - 'AmazonReviewPolarity', - 'AmazonReviewFull', - 'EnWik9'] diff --git a/torchtext/legacy/datasets/babi.py b/torchtext/legacy/datasets/babi.py deleted file mode 100644 index 631642b4d1..0000000000 --- a/torchtext/legacy/datasets/babi.py +++ /dev/null @@ -1,140 +0,0 @@ -import os -from io import open - -import torch - -from ..data import Dataset, Field, Example, Iterator - - -class BABI20Field(Field): - - def __init__(self, memory_size, **kwargs): - super(BABI20Field, self).__init__(**kwargs) - self.memory_size = memory_size - self.unk_token = None - self.batch_first = True - - def preprocess(self, x): - if isinstance(x, list): - return [super(BABI20Field, self).preprocess(s) for s in x] - else: - return super(BABI20Field, self).preprocess(x) - - def pad(self, minibatch): - if isinstance(minibatch[0][0], list): - self.fix_length = max(max(len(x) for x in ex) for ex in minibatch) - padded = [] - for ex in minibatch: - # sentences are indexed in reverse order and truncated to memory_size - nex = ex[::-1][:self.memory_size] - padded.append( - super(BABI20Field, self).pad(nex) - + [[self.pad_token] * self.fix_length] - * (self.memory_size - len(nex))) - self.fix_length = None - return padded - else: - return super(BABI20Field, self).pad(minibatch) - - def numericalize(self, arr, device=None): - if isinstance(arr[0][0], list): - tmp 
= [ - super(BABI20Field, self).numericalize(x, device=device).data - for x in arr - ] - arr = torch.stack(tmp) - if self.sequential: - arr = arr.contiguous() - return arr - else: - return super(BABI20Field, self).numericalize(arr, device=device) - - -class BABI20(Dataset): - urls = ['http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz'] - name = '' - dirname = '' - - def __init__(self, path, text_field, only_supporting=False, **kwargs): - fields = [('story', text_field), ('query', text_field), ('answer', text_field)] - self.sort_key = lambda x: len(x.query) - - with open(path, 'r', encoding="utf-8") as f: - triplets = self._parse(f, only_supporting) - examples = [Example.fromlist(triplet, fields) for triplet in triplets] - - super(BABI20, self).__init__(examples, fields, **kwargs) - - @staticmethod - def _parse(file, only_supporting): - data, story = [], [] - for line in file: - tid, text = line.rstrip('\n').split(' ', 1) - if tid == '1': - story = [] - # sentence - if text.endswith('.'): - story.append(text[:-1]) - # question - else: - # remove any leading or trailing whitespace after splitting - query, answer, supporting = (x.strip() for x in text.split('\t')) - if only_supporting: - substory = [story[int(i) - 1] for i in supporting.split()] - else: - substory = [x for x in story if x] - data.append((substory, query[:-1], answer)) # remove '?' - story.append("") - return data - - @classmethod - def splits(cls, text_field, path=None, root='.data', task=1, joint=False, tenK=False, - only_supporting=False, train=None, validation=None, test=None, **kwargs): - assert isinstance(task, int) and 1 <= task <= 20 - if tenK: - cls.dirname = os.path.join('tasks_1-20_v1-2', 'en-valid-10k') - else: - cls.dirname = os.path.join('tasks_1-20_v1-2', 'en-valid') - if path is None: - path = cls.download(root) - if train is None: - if joint: # put all tasks together for joint learning - train = 'all_train.txt' - if not os.path.isfile(os.path.join(path, train)): - with open(os.path.join(path, train), 'w') as tf: - for task in range(1, 21): - with open( - os.path.join(path, - 'qa' + str(task) + '_train.txt')) as f: - tf.write(f.read()) - else: - train = 'qa' + str(task) + '_train.txt' - if validation is None: - if joint: # put all tasks together for joint learning - validation = 'all_valid.txt' - if not os.path.isfile(os.path.join(path, validation)): - with open(os.path.join(path, validation), 'w') as tf: - for task in range(1, 21): - with open( - os.path.join(path, - 'qa' + str(task) + '_valid.txt')) as f: - tf.write(f.read()) - else: - validation = 'qa' + str(task) + '_valid.txt' - if test is None: - test = 'qa' + str(task) + '_test.txt' - return super(BABI20, - cls).splits(path=path, root=root, text_field=text_field, train=train, - validation=validation, test=test, **kwargs) - - @classmethod - def iters(cls, batch_size=32, root='.data', memory_size=50, task=1, joint=False, - tenK=False, only_supporting=False, sort=False, shuffle=False, device=None, - **kwargs): - text = BABI20Field(memory_size) - train, val, test = BABI20.splits(text, root=root, task=task, joint=joint, - tenK=tenK, only_supporting=only_supporting, - **kwargs) - text.build_vocab(train) - return Iterator.splits((train, val, test), batch_size=batch_size, sort=sort, - shuffle=shuffle, device=device) diff --git a/torchtext/legacy/datasets/imdb.py b/torchtext/legacy/datasets/imdb.py deleted file mode 100644 index e59ce19ecb..0000000000 --- a/torchtext/legacy/datasets/imdb.py +++ /dev/null @@ -1,80 +0,0 @@ -import os -import glob 
-import io - -from .. import data - - -class IMDB(data.Dataset): - - urls = ['http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'] - name = 'imdb' - dirname = 'aclImdb' - - @staticmethod - def sort_key(ex): - return len(ex.text) - - def __init__(self, path, text_field, label_field, **kwargs): - """Create an IMDB dataset instance given a path and fields. - - Args: - path: Path to the dataset's highest level directory - text_field: The field that will be used for text data. - label_field: The field that will be used for label data. - Remaining keyword arguments: Passed to the constructor of - data.Dataset. - """ - fields = [('text', text_field), ('label', label_field)] - examples = [] - - for label in ['pos', 'neg']: - for fname in glob.iglob(os.path.join(path, label, '*.txt')): - with io.open(fname, 'r', encoding="utf-8") as f: - text = f.readline() - examples.append(data.Example.fromlist([text, label], fields)) - - super(IMDB, self).__init__(examples, fields, **kwargs) - - @classmethod - def splits(cls, text_field, label_field, root='.data', - train='train', test='test', **kwargs): - """Create dataset objects for splits of the IMDB dataset. - - Args: - text_field: The field that will be used for the sentence. - label_field: The field that will be used for label data. - root: Root dataset storage directory. Default is '.data'. - train: The directory that contains the training examples - test: The directory that contains the test examples - Remaining keyword arguments: Passed to the splits method of - Dataset. - """ - return super(IMDB, cls).splits( - root=root, text_field=text_field, label_field=label_field, - train=train, validation=None, test=test, **kwargs) - - @classmethod - def iters(cls, batch_size=32, device=0, root='.data', vectors=None, **kwargs): - """Create iterator objects for splits of the IMDB dataset. - - Args: - batch_size: Batch_size - device: Device to create batches on. Use - 1 for CPU and None for - the currently active GPU device. - root: The root directory that contains the imdb dataset subdirectory - vectors: one of the available pretrained vectors or a list with each - element one of the available pretrained vectors (see Vocab.load_vectors) - - Remaining keyword arguments: Passed to the splits method. - """ - TEXT = data.Field() - LABEL = data.Field(sequential=False) - - train, test = cls.splits(TEXT, LABEL, root=root, **kwargs) - - TEXT.build_vocab(train, vectors=vectors) - LABEL.build_vocab(train) - - return data.BucketIterator.splits( - (train, test), batch_size=batch_size, device=device) diff --git a/torchtext/legacy/datasets/language_modeling.py b/torchtext/legacy/datasets/language_modeling.py deleted file mode 100644 index 5349c6b7b1..0000000000 --- a/torchtext/legacy/datasets/language_modeling.py +++ /dev/null @@ -1,217 +0,0 @@ -from .. import data -import io - - -class LanguageModelingDataset(data.Dataset): - """Defines a dataset for language modeling.""" - - def __init__(self, path, text_field, newline_eos=True, - encoding='utf-8', **kwargs): - """Create a LanguageModelingDataset given a path and a field. - - Args: - path: Path to the data file. - text_field: The field that will be used for text data. - newline_eos: Whether to add an token for every newline in the - data file. Default: True. - encoding: The encoding of the file. - kwargs: Passed to the constructor of - data.Dataset. 
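        A small sketch of a typical call (``corpus.txt`` and the field settings
        are hypothetical); note that the whole file ends up in a single Example:

            >>> TEXT = data.Field(lower=True, tokenize=str.split)
            >>> lm_dataset = LanguageModelingDataset('corpus.txt', TEXT)
            >>> len(lm_dataset.examples)
            1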
- """ - fields = [('text', text_field)] - text = [] - with io.open(path, encoding=encoding) as f: - for line in f: - text += text_field.preprocess(line) - if newline_eos: - text.append(u'') - - examples = [data.Example.fromlist([text], fields)] - super(LanguageModelingDataset, self).__init__( - examples, fields, **kwargs) - - -class WikiText2(LanguageModelingDataset): - - urls = ['https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip'] - name = 'wikitext-2' - dirname = 'wikitext-2' - - @classmethod - def splits(cls, text_field, root='.data', train='wiki.train.tokens', - validation='wiki.valid.tokens', test='wiki.test.tokens', - **kwargs): - """Create dataset objects for splits of the WikiText-2 dataset. - - This is the most flexible way to use the dataset. - - Args: - text_field: The field that will be used for text data. - root: The root directory that the dataset's zip archive will be - expanded into; therefore the directory in whose wikitext-2 - subdirectory the data files will be stored. - train: The filename of the train data. Default: 'wiki.train.tokens'. - validation: The filename of the validation data, or None to not - load the validation set. Default: 'wiki.valid.tokens'. - test: The filename of the test data, or None to not load the test - set. Default: 'wiki.test.tokens'. - """ - return super(WikiText2, cls).splits( - root=root, train=train, validation=validation, test=test, - text_field=text_field, **kwargs) - - @classmethod - def iters(cls, batch_size=32, bptt_len=35, device=0, root='.data', - vectors=None, **kwargs): - """Create iterator objects for splits of the WikiText-2 dataset. - - This is the simplest way to use the dataset, and assumes common - defaults for field, vocabulary, and iterator parameters. - - Args: - batch_size: Batch size. - bptt_len: Length of sequences for backpropagation through time. - device: Device to create batches on. Use -1 for CPU and None for - the currently active GPU device. - root: The root directory that the dataset's zip archive will be - expanded into; therefore the directory in whose wikitext-2 - subdirectory the data files will be stored. - vectors: One of either the available pretrained vectors - or custom pretrained vectors (see Vocab.load_vectors); - or a list of aforementioned vectors - kwargs: Passed to the splits method. - """ - TEXT = data.Field() - - train, val, test = cls.splits(TEXT, root=root, **kwargs) - - TEXT.build_vocab(train, vectors=vectors) - - return data.BPTTIterator.splits( - (train, val, test), batch_size=batch_size, bptt_len=bptt_len, - device=device) - - -class WikiText103(LanguageModelingDataset): - - urls = ['https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip'] - name = 'wikitext-103' - dirname = 'wikitext-103' - - @classmethod - def splits(cls, text_field, root='.data', train='wiki.train.tokens', - validation='wiki.valid.tokens', test='wiki.test.tokens', - **kwargs): - """Create dataset objects for splits of the WikiText-103 dataset. - - This is the most flexible way to use the dataset. - - Args: - text_field: The field that will be used for text data. - root: The root directory that the dataset's zip archive will be - expanded into; therefore the directory in whose wikitext-103 - subdirectory the data files will be stored. - train: The filename of the train data. Default: 'wiki.train.tokens'. - validation: The filename of the validation data, or None to not - load the validation set. Default: 'wiki.valid.tokens'. 
- test: The filename of the test data, or None to not load the test - set. Default: 'wiki.test.tokens'. - """ - return super(WikiText103, cls).splits( - root=root, train=train, validation=validation, test=test, - text_field=text_field, **kwargs) - - @classmethod - def iters(cls, batch_size=32, bptt_len=35, device=0, root='.data', - vectors=None, **kwargs): - """Create iterator objects for splits of the WikiText-103 dataset. - - This is the simplest way to use the dataset, and assumes common - defaults for field, vocabulary, and iterator parameters. - - Args: - batch_size: Batch size. - bptt_len: Length of sequences for backpropagation through time. - device: Device to create batches on. Use -1 for CPU and None for - the currently active GPU device. - root: The root directory that the dataset's zip archive will be - expanded into; therefore the directory in whose wikitext-2 - subdirectory the data files will be stored. - vectors: One of either the available pretrained vectors - or custom pretrained vectors (see Vocab.load_vectors); - or a list of aforementioned vectors - kwargs: Passed to the splits method. - """ - TEXT = data.Field() - - train, val, test = cls.splits(TEXT, root=root, **kwargs) - - TEXT.build_vocab(train, vectors=vectors) - - return data.BPTTIterator.splits( - (train, val, test), batch_size=batch_size, bptt_len=bptt_len, - device=device) - - -class PennTreebank(LanguageModelingDataset): - """The Penn Treebank dataset. - A relatively small dataset originally created for POS tagging. - - References: - Marcus, Mitchell P., Marcinkiewicz, Mary Ann & Santorini, Beatrice (1993). - Building a Large Annotated Corpus of English: The Penn Treebank - """ - - urls = ['https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt', - 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt', - 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt'] - name = 'penn-treebank' - dirname = '' - - @classmethod - def splits(cls, text_field, root='.data', train='ptb.train.txt', - validation='ptb.valid.txt', test='ptb.test.txt', - **kwargs): - """Create dataset objects for splits of the Penn Treebank dataset. - - Args: - text_field: The field that will be used for text data. - root: The root directory where the data files will be stored. - train: The filename of the train data. Default: 'ptb.train.txt'. - validation: The filename of the validation data, or None to not - load the validation set. Default: 'ptb.valid.txt'. - test: The filename of the test data, or None to not load the test - set. Default: 'ptb.test.txt'. - """ - return super(PennTreebank, cls).splits( - root=root, train=train, validation=validation, test=test, - text_field=text_field, **kwargs) - - @classmethod - def iters(cls, batch_size=32, bptt_len=35, device=0, root='.data', - vectors=None, **kwargs): - """Create iterator objects for splits of the Penn Treebank dataset. - - This is the simplest way to use the dataset, and assumes common - defaults for field, vocabulary, and iterator parameters. - - Args: - batch_size: Batch size. - bptt_len: Length of sequences for backpropagation through time. - device: Device to create batches on. Use -1 for CPU and None for - the currently active GPU device. - root: The root directory where the data files will be stored. - vectors: One of either the available pretrained vectors - or custom pretrained vectors (see Vocab.load_vectors); - or a list of aforementioned vectors - kwargs: Passed to the splits method. 
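        A sketch of the simplest call (the batch size and BPTT length are
        arbitrary, and the data is downloaded on first use); with the default
        field settings each batch is shaped ``[bptt_len, batch_size]``:

            >>> train_iter, val_iter, test_iter = PennTreebank.iters(batch_size=20, bptt_len=35)
            >>> batch = next(iter(train_iter))
            >>> batch.text.shape, batch.target.shape
            (torch.Size([35, 20]), torch.Size([35, 20]))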
- """ - TEXT = data.Field() - - train, val, test = cls.splits(TEXT, root=root, **kwargs) - - TEXT.build_vocab(train, vectors=vectors) - - return data.BPTTIterator.splits( - (train, val, test), batch_size=batch_size, bptt_len=bptt_len, - device=device) diff --git a/torchtext/legacy/datasets/nli.py b/torchtext/legacy/datasets/nli.py deleted file mode 100644 index 78a4a172f4..0000000000 --- a/torchtext/legacy/datasets/nli.py +++ /dev/null @@ -1,191 +0,0 @@ -from .. import data - - -class ShiftReduceField(data.Field): - - def __init__(self): - - super(ShiftReduceField, self).__init__(preprocessing=lambda parse: [ - 'reduce' if t == ')' else 'shift' for t in parse if t != '(']) - - self.build_vocab([['reduce'], ['shift']]) - - -class ParsedTextField(data.Field): - """ - Field for parsed sentences data in NLI datasets. - Expensive tokenization could be omitted from the pipeline as - the parse tree annotations are already in tokenized form. - """ - - def __init__(self, eos_token='', lower=False, reverse=False): - if reverse: - super(ParsedTextField, self).__init__( - eos_token=eos_token, lower=lower, - preprocessing=lambda parse: [t for t in parse if t not in ('(', ')')], - postprocessing=lambda parse, _: [list(reversed(p)) for p in parse], - include_lengths=True) - else: - super(ParsedTextField, self).__init__( - eos_token=eos_token, lower=lower, - preprocessing=lambda parse: [t for t in parse if t not in ('(', ')')], - include_lengths=True) - - -class NLIDataset(data.TabularDataset): - - urls = [] - dirname = '' - name = 'nli' - - @staticmethod - def sort_key(ex): - return data.interleave_keys( - len(ex.premise), len(ex.hypothesis)) - - @classmethod - def splits(cls, text_field, label_field, parse_field=None, - extra_fields=None, root='.data', train='train.jsonl', - validation='val.jsonl', test='test.jsonl'): - """Create dataset objects for splits of the SNLI dataset. - - This is the most flexible way to use the dataset. - - Args: - text_field: The field that will be used for premise and hypothesis - data. - label_field: The field that will be used for label data. - parse_field: The field that will be used for shift-reduce parser - transitions, or None to not include them. - extra_fields: A dict[json_key: Tuple(field_name, Field)] - root: The root directory that the dataset's zip archive will be - expanded into. - train: The filename of the train data. Default: 'train.jsonl'. - validation: The filename of the validation data, or None to not - load the validation set. Default: 'dev.jsonl'. - test: The filename of the test data, or None to not load the test - set. Default: 'test.jsonl'. - """ - if extra_fields is None: - extra_fields = {} - path = cls.download(root) - - if parse_field is None: - fields = {'sentence1': ('premise', text_field), - 'sentence2': ('hypothesis', text_field), - 'gold_label': ('label', label_field)} - else: - fields = {'sentence1_binary_parse': [('premise', text_field), - ('premise_transitions', parse_field)], - 'sentence2_binary_parse': [('hypothesis', text_field), - ('hypothesis_transitions', parse_field)], - 'gold_label': ('label', label_field)} - - for key in extra_fields: - if key not in fields.keys(): - fields[key] = extra_fields[key] - - return super(NLIDataset, cls).splits( - path, root, train, validation, test, - format='json', fields=fields, - filter_pred=lambda ex: ex.label != '-') - - @classmethod - def iters(cls, batch_size=32, device=0, root='.data', - vectors=None, trees=False, **kwargs): - """Create iterator objects for splits of the SNLI dataset. 
- - This is the simplest way to use the dataset, and assumes common - defaults for field, vocabulary, and iterator parameters. - - Args: - batch_size: Batch size. - device: Device to create batches on. Use -1 for CPU and None for - the currently active GPU device. - root: The root directory that the dataset's zip archive will be - expanded into; therefore the directory in whose wikitext-2 - subdirectory the data files will be stored. - vectors: one of the available pretrained vectors or a list with each - element one of the available pretrained vectors (see Vocab.load_vectors) - trees: Whether to include shift-reduce parser transitions. - Default: False. - Remaining keyword arguments: Passed to the splits method. - """ - if trees: - TEXT = ParsedTextField() - TRANSITIONS = ShiftReduceField() - else: - TEXT = data.Field(tokenize='spacy') - TRANSITIONS = None - LABEL = data.Field(sequential=False) - - train, val, test = cls.splits( - TEXT, LABEL, TRANSITIONS, root=root, **kwargs) - - TEXT.build_vocab(train, vectors=vectors) - LABEL.build_vocab(train) - - return data.BucketIterator.splits( - (train, val, test), batch_size=batch_size, device=device) - - -class SNLI(NLIDataset): - urls = ['http://nlp.stanford.edu/projects/snli/snli_1.0.zip'] - dirname = 'snli_1.0' - name = 'snli' - - @classmethod - def splits(cls, text_field, label_field, parse_field=None, root='.data', - train='snli_1.0_train.jsonl', validation='snli_1.0_dev.jsonl', - test='snli_1.0_test.jsonl'): - return super(SNLI, cls).splits(text_field, label_field, parse_field=parse_field, - root=root, train=train, validation=validation, - test=test) - - -class MultiNLI(NLIDataset): - urls = ['http://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip'] - dirname = 'multinli_1.0' - name = 'multinli' - - @classmethod - def splits(cls, text_field, label_field, parse_field=None, genre_field=None, - root='.data', - train='multinli_1.0_train.jsonl', - validation='multinli_1.0_dev_matched.jsonl', - test='multinli_1.0_dev_mismatched.jsonl'): - extra_fields = {} - if genre_field is not None: - extra_fields["genre"] = ("genre", genre_field) - - return super(MultiNLI, cls).splits(text_field, label_field, - parse_field=parse_field, - extra_fields=extra_fields, - root=root, train=train, - validation=validation, test=test) - - -class XNLI(NLIDataset): - urls = ['http://www.nyu.edu/projects/bowman/xnli/XNLI-1.0.zip'] - dirname = 'XNLI-1.0' - name = 'xnli' - - @classmethod - def splits(cls, text_field, label_field, genre_field=None, language_field=None, - root='.data', - validation='xnli.dev.jsonl', - test='xnli.test.jsonl'): - extra_fields = {} - if genre_field is not None: - extra_fields["genre"] = ("genre", genre_field) - if language_field is not None: - extra_fields["language"] = ("language", language_field) - - return super(XNLI, cls).splits(text_field, label_field, - extra_fields=extra_fields, - root=root, train=None, - validation=validation, test=test) - - @classmethod - def iters(cls, *args, **kwargs): - raise NotImplementedError('XNLI dataset does not support iters') diff --git a/torchtext/legacy/datasets/sequence_tagging.py b/torchtext/legacy/datasets/sequence_tagging.py deleted file mode 100644 index 849d8be15d..0000000000 --- a/torchtext/legacy/datasets/sequence_tagging.py +++ /dev/null @@ -1,102 +0,0 @@ -from .. import data -import random - - -class SequenceTaggingDataset(data.Dataset): - """Defines a dataset for sequence tagging. Examples in this dataset - contain paired lists -- paired list of words and tags. 
- - For example, in the case of part-of-speech tagging, an example is of the - form - [I, love, PyTorch, .] paired with [PRON, VERB, PROPN, PUNCT] - - See torchtext/test/sequence_tagging.py on how to use this class. - """ - - @staticmethod - def sort_key(example): - for attr in dir(example): - if not callable(getattr(example, attr)) and \ - not attr.startswith("__"): - return len(getattr(example, attr)) - return 0 - - def __init__(self, path, fields, encoding="utf-8", separator="\t", **kwargs): - examples = [] - columns = [] - - with open(path, encoding=encoding) as input_file: - for line in input_file: - line = line.strip() - if line == "": - if columns: - examples.append(data.Example.fromlist(columns, fields)) - columns = [] - else: - for i, column in enumerate(line.split(separator)): - if len(columns) < i + 1: - columns.append([]) - columns[i].append(column) - - if columns: - examples.append(data.Example.fromlist(columns, fields)) - super(SequenceTaggingDataset, self).__init__(examples, fields, - **kwargs) - - -class UDPOS(SequenceTaggingDataset): - - # Universal Dependencies English Web Treebank. - # Download original at http://universaldependencies.org/ - # License: http://creativecommons.org/licenses/by-sa/4.0/ - urls = ['https://bitbucket.org/sivareddyg/public/downloads/en-ud-v2.zip'] - dirname = 'en-ud-v2' - name = 'udpos' - - @classmethod - def splits(cls, fields, root=".data", train="en-ud-tag.v2.train.txt", - validation="en-ud-tag.v2.dev.txt", - test="en-ud-tag.v2.test.txt", **kwargs): - """Downloads and loads the Universal Dependencies Version 2 POS Tagged - data. - """ - - return super(UDPOS, cls).splits( - fields=fields, root=root, train=train, validation=validation, - test=test, **kwargs) - - -class CoNLL2000Chunking(SequenceTaggingDataset): - # CoNLL 2000 Chunking Dataset - # https://www.clips.uantwerpen.be/conll2000/chunking/ - urls = ['https://www.clips.uantwerpen.be/conll2000/chunking/train.txt.gz', - 'https://www.clips.uantwerpen.be/conll2000/chunking/test.txt.gz'] - dirname = '' - name = 'conll2000' - - @classmethod - def splits(cls, fields, root=".data", train="train.txt", - test="test.txt", validation_frac=0.1, **kwargs): - """Downloads and loads the CoNLL 2000 Chunking dataset. - NOTE: There is only a train and test dataset so we use 10% of the train set as validation - """ - - train, test = super(CoNLL2000Chunking, cls).splits( - fields=fields, root=root, train=train, - test=test, separator=' ', **kwargs) - - # HACK: Saving the sort key function as the split() call removes it - sort_key = train.sort_key - - # Now split the train set - # Force a random seed to make the split deterministic - random.seed(0) - train, val = train.split(1 - validation_frac, random_state=random.getstate()) - # Reset the seed - random.seed() - - # HACK: Set the sort key - train.sort_key = sort_key - val.sort_key = sort_key - - return train, val, test diff --git a/torchtext/legacy/datasets/sst.py b/torchtext/legacy/datasets/sst.py deleted file mode 100644 index 8c793fad93..0000000000 --- a/torchtext/legacy/datasets/sst.py +++ /dev/null @@ -1,104 +0,0 @@ -import os - -from .. import data - - -class SST(data.Dataset): - - urls = ['http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'] - dirname = 'trees' - name = 'sst' - - @staticmethod - def sort_key(ex): - return len(ex.text) - - def __init__(self, path, text_field, label_field, subtrees=False, - fine_grained=False, **kwargs): - """Create an SST dataset instance given a path and fields. 
- - Args: - path: Path to the data file - text_field: The field that will be used for text data. - label_field: The field that will be used for label data. - subtrees: Whether to include sentiment-tagged subphrases - in addition to complete examples. Default: False. - fine_grained: Whether to use 5-class instead of 3-class - labeling. Default: False. - Remaining keyword arguments: Passed to the constructor of - data.Dataset. - """ - fields = [('text', text_field), ('label', label_field)] - - def get_label_str(label): - pre = 'very ' if fine_grained else '' - return {'0': pre + 'negative', '1': 'negative', '2': 'neutral', - '3': 'positive', '4': pre + 'positive', None: None}[label] - label_field.preprocessing = data.Pipeline(get_label_str) - with open(os.path.expanduser(path)) as f: - if subtrees: - examples = [ex for line in f for ex in - data.Example.fromtree(line, fields, True)] - else: - examples = [data.Example.fromtree(line, fields) for line in f] - super(SST, self).__init__(examples, fields, **kwargs) - - @classmethod - def splits(cls, text_field, label_field, root='.data', - train='train.txt', validation='dev.txt', test='test.txt', - train_subtrees=False, **kwargs): - """Create dataset objects for splits of the SST dataset. - - Args: - text_field: The field that will be used for the sentence. - label_field: The field that will be used for label data. - root: The root directory that the dataset's zip archive will be - expanded into; therefore the directory in whose trees - subdirectory the data files will be stored. - train: The filename of the train data. Default: 'train.txt'. - validation: The filename of the validation data, or None to not - load the validation set. Default: 'dev.txt'. - test: The filename of the test data, or None to not load the test - set. Default: 'test.txt'. - train_subtrees: Whether to use all subtrees in the training set. - Default: False. - Remaining keyword arguments: Passed to the splits method of - Dataset. - """ - path = cls.download(root) - - train_data = None if train is None else cls( - os.path.join(path, train), text_field, label_field, subtrees=train_subtrees, - **kwargs) - val_data = None if validation is None else cls( - os.path.join(path, validation), text_field, label_field, **kwargs) - test_data = None if test is None else cls( - os.path.join(path, test), text_field, label_field, **kwargs) - return tuple(d for d in (train_data, val_data, test_data) - if d is not None) - - @classmethod - def iters(cls, batch_size=32, device=0, root='.data', vectors=None, **kwargs): - """Create iterator objects for splits of the SST dataset. - - Args: - batch_size: Batch_size - device: Device to create batches on. Use - 1 for CPU and None for - the currently active GPU device. - root: The root directory that the dataset's zip archive will be - expanded into; therefore the directory in whose trees - subdirectory the data files will be stored. - vectors: one of the available pretrained vectors or a list with each - element one of the available pretrained vectors (see Vocab.load_vectors) - Remaining keyword arguments: Passed to the splits method. 
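        A sketch of the one-call setup this method provided (the batch size is
        arbitrary; the dataset is downloaded on first use):

            >>> train_iter, val_iter, test_iter = SST.iters(batch_size=64)
            >>> batch = next(iter(train_iter))
            >>> batch.text, batch.label  # numericalized sentences and labels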
- """ - TEXT = data.Field() - LABEL = data.Field(sequential=False) - - train, val, test = cls.splits(TEXT, LABEL, root=root, **kwargs) - - TEXT.build_vocab(train, vectors=vectors) - LABEL.build_vocab(train) - - return data.BucketIterator.splits( - (train, val, test), batch_size=batch_size, device=device) diff --git a/torchtext/legacy/datasets/text_classification.py b/torchtext/legacy/datasets/text_classification.py deleted file mode 100644 index 6c5f47b4ee..0000000000 --- a/torchtext/legacy/datasets/text_classification.py +++ /dev/null @@ -1,452 +0,0 @@ -import logging -import torch -import io -from torchtext.utils import download_from_url, extract_archive, unicode_csv_reader -from torchtext.data.utils import ngrams_iterator -from torchtext.data.utils import get_tokenizer -from torchtext.legacy.vocab import build_vocab_from_iterator -from torchtext.legacy.vocab import Vocab -from tqdm import tqdm - -URLS = { - 'AG_NEWS': - 'https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbUDNpeUdjb0wxRms', - 'SogouNews': - 'https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbUkVqNEszd0pHaFE', - 'DBpedia': - 'https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k', - 'YelpReviewPolarity': - 'https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbNUpYQ2N3SGlFaDg', - 'YelpReviewFull': - 'https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbZlU4dXhHTFhZQU0', - 'YahooAnswers': - 'https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU', - 'AmazonReviewPolarity': - 'https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbaW12WVVZS2drcnM', - 'AmazonReviewFull': - 'https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbZVhsUnRWRDhETzA' -} - - -def _csv_iterator(data_path, ngrams, yield_cls=False): - tokenizer = get_tokenizer("basic_english") - with io.open(data_path, encoding="utf8") as f: - reader = unicode_csv_reader(f) - for row in reader: - tokens = ' '.join(row[1:]) - tokens = tokenizer(tokens) - if yield_cls: - yield int(row[0]) - 1, ngrams_iterator(tokens, ngrams) - else: - yield ngrams_iterator(tokens, ngrams) - - -def _create_data_from_iterator(vocab, iterator, include_unk): - data = [] - labels = [] - with tqdm(unit_scale=0, unit='lines') as t: - for cls, tokens in iterator: - if include_unk: - tokens = torch.tensor([vocab[token] for token in tokens]) - else: - token_ids = list(filter(lambda x: x is not Vocab.UNK, [vocab[token] - for token in tokens])) - tokens = torch.tensor(token_ids) - if len(tokens) == 0: - logging.info('Row contains no tokens.') - data.append((cls, tokens)) - labels.append(cls) - t.update(1) - return data, set(labels) - - -class TextClassificationDataset(torch.utils.data.Dataset): - """Defines an abstract text classification datasets. - Currently, we only support the following datasets: - - - AG_NEWS - - SogouNews - - DBpedia - - YelpReviewPolarity - - YelpReviewFull - - YahooAnswers - - AmazonReviewPolarity - - AmazonReviewFull - - """ - - def __init__(self, vocab, data, labels): - """Initiate text-classification dataset. - - Args: - vocab: Vocabulary object used for dataset. - data: a list of label/tokens tuple. tokens are a tensor after - numericalizing the string tokens. label is an integer. - [(label1, tokens1), (label2, tokens2), (label2, tokens3)] - label: a set of the labels. 
- {label1, label2} - - Examples: - See the examples in examples/text_classification/ - - """ - - super(TextClassificationDataset, self).__init__() - self._data = data - self._labels = labels - self._vocab = vocab - - def __getitem__(self, i): - return self._data[i] - - def __len__(self): - return len(self._data) - - def __iter__(self): - for x in self._data: - yield x - - def get_labels(self): - return self._labels - - def get_vocab(self): - return self._vocab - - -def _setup_datasets(dataset_name, root='.data', ngrams=1, vocab=None, include_unk=False): - dataset_tar = download_from_url(URLS[dataset_name], root=root) - extracted_files = extract_archive(dataset_tar) - - for fname in extracted_files: - if fname.endswith('train.csv'): - train_csv_path = fname - if fname.endswith('test.csv'): - test_csv_path = fname - - if vocab is None: - logging.info('Building Vocab based on {}'.format(train_csv_path)) - vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path, ngrams)) - else: - if not isinstance(vocab, Vocab): - raise TypeError("Passed vocabulary is not of type Vocab") - logging.info('Vocab has {} entries'.format(len(vocab))) - logging.info('Creating training data') - train_data, train_labels = _create_data_from_iterator( - vocab, _csv_iterator(train_csv_path, ngrams, yield_cls=True), include_unk) - logging.info('Creating testing data') - test_data, test_labels = _create_data_from_iterator( - vocab, _csv_iterator(test_csv_path, ngrams, yield_cls=True), include_unk) - if len(train_labels ^ test_labels) > 0: - raise ValueError("Training and test labels don't match") - return (TextClassificationDataset(vocab, train_data, train_labels), - TextClassificationDataset(vocab, test_data, test_labels)) - - -def AG_NEWS(*args, **kwargs): - """ Defines AG_NEWS datasets. - - The labels include: - - - 0 : World - - 1 : Sports - - 2 : Business - - 3 : Sci/Tech - - Create supervised learning dataset: AG_NEWS - - Separately returns the training and test dataset - - Args: - root: Directory where the datasets are saved. Default: ".data" - ngrams: a contiguous sequence of n items from s string text. - Default: 1 - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. - include_unk: include unknown token in the data (Default: False) - - Examples: - >>> train_dataset, test_dataset = torchtext.datasets.AG_NEWS(ngrams=3) - - """ - - return _setup_datasets(*(("AG_NEWS",) + args), **kwargs) - - -def SogouNews(*args, **kwargs): - """ Defines SogouNews datasets. - - The labels include: - - - 0 : Sports - - 1 : Finance - - 2 : Entertainment - - 3 : Automobile - - 4 : Technology - - Create supervised learning dataset: SogouNews - - Separately returns the training and test dataset - - Args: - root: Directory where the datasets are saved. Default: ".data" - ngrams: a contiguous sequence of n items from s string text. - Default: 1 - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. - include_unk: include unknown token in the data (Default: False) - - Examples: - >>> train_dataset, test_dataset = torchtext.datasets.SogouNews(ngrams=3) - - """ - - return _setup_datasets(*(("SogouNews",) + args), **kwargs) - - -def DBpedia(*args, **kwargs): - """ Defines DBpedia datasets. 
- - The labels include: - - - 0 : Company - - 1 : EducationalInstitution - - 2 : Artist - - 3 : Athlete - - 4 : OfficeHolder - - 5 : MeanOfTransportation - - 6 : Building - - 7 : NaturalPlace - - 8 : Village - - 9 : Animal - - 10 : Plant - - 11 : Album - - 12 : Film - - 13 : WrittenWork - - Create supervised learning dataset: DBpedia - - Separately returns the training and test dataset - - Args: - root: Directory where the datasets are saved. Default: ".data" - ngrams: a contiguous sequence of n items from s string text. - Default: 1 - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. - include_unk: include unknown token in the data (Default: False) - - Examples: - >>> train_dataset, test_dataset = torchtext.datasets.DBpedia(ngrams=3) - - """ - - return _setup_datasets(*(("DBpedia",) + args), **kwargs) - - -def YelpReviewPolarity(*args, **kwargs): - """ Defines YelpReviewPolarity datasets. - - The labels include: - - - 0 : Negative polarity. - - 1 : Positive polarity. - - Create supervised learning dataset: YelpReviewPolarity - - Separately returns the training and test dataset - - Args: - root: Directory where the datasets are saved. Default: ".data" - ngrams: a contiguous sequence of n items from s string text. - Default: 1 - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. - include_unk: include unknown token in the data (Default: False) - - Examples: - >>> train_dataset, test_dataset = torchtext.datasets.YelpReviewPolarity(ngrams=3) - - """ - - return _setup_datasets(*(("YelpReviewPolarity",) + args), **kwargs) - - -def YelpReviewFull(*args, **kwargs): - """ Defines YelpReviewFull datasets. - - The labels include: - - 0 - 4 : rating classes (4 is highly recommended). - - Create supervised learning dataset: YelpReviewFull - - Separately returns the training and test dataset - - Args: - root: Directory where the datasets are saved. Default: ".data" - ngrams: a contiguous sequence of n items from s string text. - Default: 1 - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. - include_unk: include unknown token in the data (Default: False) - - Examples: - >>> train_dataset, test_dataset = torchtext.datasets.YelpReviewFull(ngrams=3) - - """ - - return _setup_datasets(*(("YelpReviewFull",) + args), **kwargs) - - -def YahooAnswers(*args, **kwargs): - """ Defines YahooAnswers datasets. - - The labels include: - - - 0 : Society & Culture - - 1 : Science & Mathematics - - 2 : Health - - 3 : Education & Reference - - 4 : Computers & Internet - - 5 : Sports - - 6 : Business & Finance - - 7 : Entertainment & Music - - 8 : Family & Relationships - - 9 : Politics & Government - - Create supervised learning dataset: YahooAnswers - - Separately returns the training and test dataset - - Args: - root: Directory where the datasets are saved. Default: ".data" - ngrams: a contiguous sequence of n items from s string text. - Default: 1 - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. - include_unk: include unknown token in the data (Default: False) - - Examples: - >>> train_dataset, test_dataset = torchtext.datasets.YahooAnswers(ngrams=3) - - """ - - return _setup_datasets(*(("YahooAnswers",) + args), **kwargs) - - -def AmazonReviewPolarity(*args, **kwargs): - """ Defines AmazonReviewPolarity datasets. 
- - The labels include: - - - 0 : Negative polarity - - 1 : Positive polarity - - Create supervised learning dataset: AmazonReviewPolarity - - Separately returns the training and test dataset - - Args: - root: Directory where the datasets are saved. Default: ".data" - ngrams: a contiguous sequence of n items from s string text. - Default: 1 - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. - include_unk: include unknown token in the data (Default: False) - - Examples: - >>> train_dataset, test_dataset = torchtext.datasets.AmazonReviewPolarity(ngrams=3) - - """ - - return _setup_datasets(*(("AmazonReviewPolarity",) + args), **kwargs) - - -def AmazonReviewFull(*args, **kwargs): - """ Defines AmazonReviewFull datasets. - - The labels include: - - 0 - 4 : rating classes (4 is highly recommended) - - Create supervised learning dataset: AmazonReviewFull - - Separately returns the training and test dataset - - Args: - root: Directory where the dataset are saved. Default: ".data" - ngrams: a contiguous sequence of n items from s string text. - Default: 1 - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. - include_unk: include unknown token in the data (Default: False) - - Examples: - >>> train_dataset, test_dataset = torchtext.datasets.AmazonReviewFull(ngrams=3) - - """ - - return _setup_datasets(*(("AmazonReviewFull",) + args), **kwargs) - - -DATASETS = { - 'AG_NEWS': AG_NEWS, - 'SogouNews': SogouNews, - 'DBpedia': DBpedia, - 'YelpReviewPolarity': YelpReviewPolarity, - 'YelpReviewFull': YelpReviewFull, - 'YahooAnswers': YahooAnswers, - 'AmazonReviewPolarity': AmazonReviewPolarity, - 'AmazonReviewFull': AmazonReviewFull -} - - -LABELS = { - 'AG_NEWS': {0: 'World', - 1: 'Sports', - 2: 'Business', - 3: 'Sci/Tech'}, - 'SogouNews': {0: 'Sports', - 1: 'Finance', - 2: 'Entertainment', - 3: 'Automobile', - 4: 'Technology'}, - 'DBpedia': {0: 'Company', - 1: 'EducationalInstitution', - 2: 'Artist', - 3: 'Athlete', - 4: 'OfficeHolder', - 5: 'MeanOfTransportation', - 6: 'Building', - 7: 'NaturalPlace', - 8: 'Village', - 9: 'Animal', - 10: 'Plant', - 11: 'Album', - 12: 'Film', - 13: 'WrittenWork'}, - 'YelpReviewPolarity': {0: 'Negative polarity', - 1: 'Positive polarity'}, - 'YelpReviewFull': {0: 'score 1', - 1: 'score 2', - 2: 'score 3', - 3: 'score 4', - 4: 'score 5'}, - 'YahooAnswers': {0: 'Society & Culture', - 1: 'Science & Mathematics', - 2: 'Health', - 3: 'Education & Reference', - 4: 'Computers & Internet', - 5: 'Sports', - 6: 'Business & Finance', - 7: 'Entertainment & Music', - 8: 'Family & Relationships', - 9: 'Politics & Government'}, - 'AmazonReviewPolarity': {0: 'Negative polarity', - 1: 'Positive polarity'}, - 'AmazonReviewFull': {0: 'score 1', - 1: 'score 2', - 2: 'score 3', - 3: 'score 4', - 4: 'score 5'} -} diff --git a/torchtext/legacy/datasets/translation.py b/torchtext/legacy/datasets/translation.py deleted file mode 100644 index 6e6bfeb36e..0000000000 --- a/torchtext/legacy/datasets/translation.py +++ /dev/null @@ -1,234 +0,0 @@ -import os -try: - import defusedxml.ElementTree as ET -except ImportError: - import xml.etree.ElementTree as ET -import glob -import io -import codecs - -from .. 
import data - - -class TranslationDataset(data.Dataset): - """Defines a dataset for machine translation.""" - - @staticmethod - def sort_key(ex): - return data.interleave_keys(len(ex.src), len(ex.trg)) - - def __init__(self, path, exts, fields, **kwargs): - """Create a TranslationDataset given paths and fields. - - Args: - path: Common prefix of paths to the data files for both languages. - exts: A tuple containing the extension to path for each language. - fields: A tuple containing the fields that will be used for data - in each language. - Remaining keyword arguments: Passed to the constructor of - data.Dataset. - """ - if not isinstance(fields[0], (tuple, list)): - fields = [('src', fields[0]), ('trg', fields[1])] - - src_path, trg_path = tuple(os.path.expanduser(path + x) for x in exts) - - examples = [] - with io.open(src_path, mode='r', encoding='utf-8') as src_file, \ - io.open(trg_path, mode='r', encoding='utf-8') as trg_file: - for src_line, trg_line in zip(src_file, trg_file): - src_line, trg_line = src_line.strip(), trg_line.strip() - if src_line != '' and trg_line != '': - examples.append(data.Example.fromlist( - [src_line, trg_line], fields)) - - super(TranslationDataset, self).__init__(examples, fields, **kwargs) - - @classmethod - def splits(cls, exts, fields, path=None, root='.data', - train='train', validation='val', test='test', **kwargs): - """Create dataset objects for splits of a TranslationDataset. - - Args: - exts: A tuple containing the extension to path for each language. - fields: A tuple containing the fields that will be used for data - in each language. - path (str): Common prefix of the splits' file paths, or None to use - the result of cls.download(root). - root: Root dataset storage directory. Default is '.data'. - train: The prefix of the train data. Default: 'train'. - validation: The prefix of the validation data. Default: 'val'. - test: The prefix of the test data. Default: 'test'. - Remaining keyword arguments: Passed to the splits method of - Dataset. - """ - if path is None: - path = cls.download(root) - - train_data = None if train is None else cls( - os.path.join(path, train), exts, fields, **kwargs) - val_data = None if validation is None else cls( - os.path.join(path, validation), exts, fields, **kwargs) - test_data = None if test is None else cls( - os.path.join(path, test), exts, fields, **kwargs) - return tuple(d for d in (train_data, val_data, test_data) - if d is not None) - - -class Multi30k(TranslationDataset): - """The small-dataset WMT 2016 multimodal task, also known as Flickr30k""" - - urls = ['http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz', - 'http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz', - 'http://www.quest.dcs.shef.ac.uk/' - 'wmt17_files_mmt/mmt_task1_test2016.tar.gz'] - name = 'multi30k' - dirname = '' - - @classmethod - def splits(cls, exts, fields, root='.data', - train='train', validation='val', test='test2016', **kwargs): - """Create dataset objects for splits of the Multi30k dataset. - - Args: - exts: A tuple containing the extension to path for each language. - fields: A tuple containing the fields that will be used for data - in each language. - root: Root dataset storage directory. Default is '.data'. - train: The prefix of the train data. Default: 'train'. - validation: The prefix of the validation data. Default: 'val'. - test: The prefix of the test data. Default: 'test'. - Remaining keyword arguments: Passed to the splits method of - Dataset. 
- """ - - # TODO: This is a _HORRIBLE_ patch related to #208 - # 'path' can be passed as a kwarg to the translation dataset constructor - # or has to be set (so the download wouldn't be duplicated). A good idea - # seems to rename the existence check variable from path to something else - if 'path' not in kwargs: - expected_folder = os.path.join(root, cls.name) - path = expected_folder if os.path.exists(expected_folder) else None - else: - path = kwargs['path'] - del kwargs['path'] - - return super(Multi30k, cls).splits( - exts, fields, path, root, train, validation, test, **kwargs) - - -class IWSLT(TranslationDataset): - """The IWSLT 2016 TED talk translation task""" - - base_url = 'https://wit3.fbk.eu/archive/2016-01//texts/{}/{}/{}.tgz' - name = 'iwslt' - base_dirname = '{}-{}' - - @classmethod - def splits(cls, exts, fields, root='.data', - train='train', validation='IWSLT16.TED.tst2013', - test='IWSLT16.TED.tst2014', **kwargs): - """Create dataset objects for splits of the IWSLT dataset. - - Args: - exts: A tuple containing the extension to path for each language. - fields: A tuple containing the fields that will be used for data - in each language. - root: Root dataset storage directory. Default is '.data'. - train: The prefix of the train data. Default: 'train'. - validation: The prefix of the validation data. Default: 'val'. - test: The prefix of the test data. Default: 'test'. - Remaining keyword arguments: Passed to the splits method of - Dataset. - """ - cls.dirname = cls.base_dirname.format(exts[0][1:], exts[1][1:]) - cls.urls = [cls.base_url.format(exts[0][1:], exts[1][1:], cls.dirname)] - check = os.path.join(root, cls.name, cls.dirname) - path = cls.download(root, check=check) - - train = '.'.join([train, cls.dirname]) - validation = '.'.join([validation, cls.dirname]) - if test is not None: - test = '.'.join([test, cls.dirname]) - - if not os.path.exists(os.path.join(path, train) + exts[0]): - cls.clean(path) - - train_data = None if train is None else cls( - os.path.join(path, train), exts, fields, **kwargs) - val_data = None if validation is None else cls( - os.path.join(path, validation), exts, fields, **kwargs) - test_data = None if test is None else cls( - os.path.join(path, test), exts, fields, **kwargs) - return tuple(d for d in (train_data, val_data, test_data) - if d is not None) - - @staticmethod - def clean(path): - for f_xml in glob.iglob(os.path.join(path, '*.xml')): - print(f_xml) - f_txt = os.path.splitext(f_xml)[0] - with codecs.open(f_txt, mode='w', encoding='utf-8') as fd_txt: - root = ET.parse(f_xml).getroot()[0] - for doc in root.findall('doc'): - for e in doc.findall('seg'): - fd_txt.write(e.text.strip() + '\n') - - xml_tags = ['', ''), - (r'&', '&'), - (r'<', '<'), - (r'>', '>'), - (r'', ''), - (r'<[^>]*>', ''), - (r'\[http:[^] ]*', '['), - (r'\|thumb', ''), - (r'\|left', ''), - (r'\|right', ''), - (r'\|\d+px', ''), - (r'\[\[image:[^\[\]]*\|', ''), - (r'\[\[category:([^|\]]*)[^]]*\]\]', '[[$1]]'), - (r'\[\[[a-z\-]*:[^\]]*\]\]', ''), - (r'\[\[[^\|\]]*\|', '[['), - (r'\{\{[^\}]*\}\}', ''), - (r'\{[^\}]*\}', ''), - (r'\[', ''), - (r'\]', ''), - (r'&[^;]*;', ' '), - (r'A', 'a'), (r'B', 'b'), (r'C', 'c'), - (r'D', 'd'), (r'E', 'e'), (r'F', 'f'), - (r'G', 'g'), (r'H', 'h'), (r'I', 'i'), - (r'J', 'j'), (r'K', 'k'), (r'L', 'l'), - (r'M', 'm'), (r'N', 'n'), (r'O', 'o'), - (r'P', 'p'), (r'Q', 'q'), (r'R', 'r'), - (r'S', 's'), (r'T', 't'), (r'U', 'u'), - (r'V', 'v'), (r'W', 'w'), (r'X', 'x'), - (r'Y', 'y'), (r'Z', 'z'), - (r'0', ' zero '), (r'1', ' one '), (r'2', 
' two '), - (r'3', ' three '), (r'4', ' four '), (r'5', ' five '), - (r'6', ' six '), (r'7', ' seven '), (r'8', ' eight '), - (r'9', ' nine '), - (r'[^a-z\n]+', ' '), - (r'\n ', ''), - (r'\s+', ' '), - (r'\n\s*\n', r'\n') - ] -enwik9_norm_transform = custom_replace(_patterns) - - -def generate_offsets(filename): - offsets = [] - with open(filename) as f: - offsets.append(f.tell()) - while f.readline(): - offsets.append(f.tell()) - return offsets - - -def read_lines_from_iterator(data_path, offsets, begin_line, num_lines): - with open(data_path) as f: - f.seek(offsets[begin_line]) - for i in range(num_lines): - yield f.readline() - - -def preprocess_raw_enwik9(input_filename, output_filename): - with open(input_filename, 'r') as f1: - with open(output_filename, 'w') as f2: - while True: - line = f1.readline() - if not line: - break - line = list(enwik9_norm_transform([line]))[0] - if line != ' ' and line != '': - if line[0] == ' ': - line = line[1:] - f2.writelines(line + '\n') - - -class EnWik9(torch.utils.data.Dataset): - r"""Compressed size of first 10^9 bytes of enwiki-20060303-pages-articles.xml. - It's part of Large Text Compression Benchmark project - """ - - def __init__(self, begin_line=0, num_lines=6348957, root='.data'): - """Initiate EnWik9 dataset. - - Args: - begin_line: the number of beginning line. Default: 0 - num_lines: the number of lines to be loaded. Default: 6348957 - root: Directory where the datasets are saved. Default: ".data" - data: a list of label/tokens tuple. tokens are a tensor after - - Examples: - >>> from torchtext.datasets import EnWik9 - >>> enwik9 = EnWik9(num_lines=20000) - >>> vocab = enwik9.get_vocab() - """ - - super(EnWik9, self).__init__() - - processed_file = os.path.join(root, 'norm_enwik9') - if not os.path.exists(processed_file): - url = 'http://mattmahoney.net/dc/enwik9.zip' - dataset_zip = download_from_url(url, - path=os.path.join(root, 'enwik9.zip'), - root=root) - extracted_file = extract_archive(dataset_zip) - raw_file = extracted_file[0] - preprocess_raw_enwik9(raw_file, processed_file) - - # Meta information - offsets = generate_offsets(processed_file) - read_lines = read_lines_from_iterator(processed_file, - offsets, begin_line, num_lines) - - self._data = [] - for item in simple_space_split(read_lines): - self._data += item - - self._vocab = None - - def __getitem__(self, i): - return self._data[i] - - def __len__(self): - return len(self._data) - - def __iter__(self): - for x in self._data: - yield x - - def get_vocab(self): - if self._vocab is None: - self._vocab = build_vocab_from_iterator([self._data]) - return self._vocab diff --git a/torchtext/legacy/vocab.py b/torchtext/legacy/vocab.py deleted file mode 100755 index a28ec440ae..0000000000 --- a/torchtext/legacy/vocab.py +++ /dev/null @@ -1,294 +0,0 @@ -from collections import defaultdict -import logging -import torch -from tqdm import tqdm -from collections import Counter -from torchtext.vocab import ( - pretrained_aliases, # not in legacy - Vectors, # not in legacy -) - -logger = logging.getLogger(__name__) - - -class Vocab(object): - """Defines a vocabulary object that will be used to numericalize a field. - - Attributes: - freqs: A collections.Counter object holding the frequencies of tokens - in the data used to build the Vocab. - stoi: A collections.defaultdict instance mapping token strings to - numerical identifiers. - itos: A list of token strings indexed by their numerical identifiers. 
- """ - - # TODO (@mttk): Populate classs with default values of special symbols - UNK = '' - - def __init__(self, counter, max_size=None, min_freq=1, specials=('', ''), - vectors=None, unk_init=None, vectors_cache=None, specials_first=True): - """Create a Vocab object from a collections.Counter. - - Args: - counter: collections.Counter object holding the frequencies of - each value found in the data. - max_size: The maximum size of the vocabulary, or None for no - maximum. Default: None. - min_freq: The minimum frequency needed to include a token in the - vocabulary. Values less than 1 will be set to 1. Default: 1. - specials: The list of special tokens (e.g., padding or eos) that - will be prepended to the vocabulary. Default: [', ''] - vectors: One of either the available pretrained vectors - or custom pretrained vectors (see Vocab.load_vectors); - or a list of aforementioned vectors - unk_init (callback): by default, initialize out-of-vocabulary word vectors - to zero vectors; can be any function that takes in a Tensor and - returns a Tensor of the same size. Default: 'torch.zeros' - vectors_cache: directory for cached vectors. Default: '.vector_cache' - specials_first: Whether to add special tokens into the vocabulary at first. - If it is False, they are added into the vocabulary at last. - Default: True. - """ - self.freqs = counter - counter = counter.copy() - min_freq = max(min_freq, 1) - - self.itos = list() - self.unk_index = None - if specials_first: - self.itos = list(specials) - # only extend max size if specials are prepended - max_size = None if max_size is None else max_size + len(specials) - - # frequencies of special tokens are not counted when building vocabulary - # in frequency order - for tok in specials: - del counter[tok] - - # sort by frequency, then alphabetically - words_and_frequencies = sorted(counter.items(), key=lambda tup: tup[0]) - words_and_frequencies.sort(key=lambda tup: tup[1], reverse=True) - - for word, freq in words_and_frequencies: - if freq < min_freq or len(self.itos) == max_size: - break - self.itos.append(word) - - if Vocab.UNK in specials: # hard-coded for now - unk_index = specials.index(Vocab.UNK) # position in list - # account for ordering of specials, set variable - self.unk_index = unk_index if specials_first else len(self.itos) + unk_index - self.stoi = defaultdict(self._default_unk_index) - else: - self.stoi = defaultdict() - - if not specials_first: - self.itos.extend(list(specials)) - - # stoi is simply a reverse dict for itos - self.stoi.update({tok: i for i, tok in enumerate(self.itos)}) - - self.vectors = None - if vectors is not None: - self.load_vectors(vectors, unk_init=unk_init, cache=vectors_cache) - else: - assert unk_init is None and vectors_cache is None - - def _default_unk_index(self): - return self.unk_index - - def __getitem__(self, token): - return self.stoi.get(token, self.stoi.get(Vocab.UNK)) - - def __getstate__(self): - # avoid picking defaultdict - attrs = dict(self.__dict__) - # cast to regular dict - attrs['stoi'] = dict(self.stoi) - return attrs - - def __setstate__(self, state): - if state.get("unk_index", None) is None: - stoi = defaultdict() - else: - stoi = defaultdict(self._default_unk_index) - stoi.update(state['stoi']) - state['stoi'] = stoi - self.__dict__.update(state) - - def __eq__(self, other): - if self.freqs != other.freqs: - return False - if self.stoi != other.stoi: - return False - if self.itos != other.itos: - return False - if self.vectors != other.vectors: - return False - return True - - def 
__len__(self): - return len(self.itos) - - def lookup_indices(self, tokens): - indices = [self.__getitem__(token) for token in tokens] - return indices - - def extend(self, v, sort=False): - words = sorted(v.itos) if sort else v.itos - for w in words: - if w not in self.stoi: - self.itos.append(w) - self.stoi[w] = len(self.itos) - 1 - - def load_vectors(self, vectors, **kwargs): - """ - Args: - vectors: one of or a list containing instantiations of the - GloVe, CharNGram, or Vectors classes. Alternatively, one - of or a list of available pretrained vectors: - - charngram.100d - fasttext.en.300d - fasttext.simple.300d - glove.42B.300d - glove.840B.300d - glove.twitter.27B.25d - glove.twitter.27B.50d - glove.twitter.27B.100d - glove.twitter.27B.200d - glove.6B.50d - glove.6B.100d - glove.6B.200d - glove.6B.300d - - Remaining keyword arguments: Passed to the constructor of Vectors classes. - """ - if not isinstance(vectors, list): - vectors = [vectors] - for idx, vector in enumerate(vectors): - if isinstance(vector, str): - # Convert the string pretrained vector identifier - # to a Vectors object - if vector not in pretrained_aliases: - raise ValueError( - "Got string input vector {}, but allowed pretrained " - "vectors are {}".format( - vector, list(pretrained_aliases.keys()))) - vectors[idx] = pretrained_aliases[vector](**kwargs) - elif not isinstance(vector, Vectors): - raise ValueError( - "Got input vectors of type {}, expected str or " - "Vectors object".format(type(vector))) - - tot_dim = sum(v.dim for v in vectors) - self.vectors = torch.Tensor(len(self), tot_dim) - for i, token in enumerate(self.itos): - start_dim = 0 - for v in vectors: - end_dim = start_dim + v.dim - self.vectors[i][start_dim:end_dim] = v[token.strip()] - start_dim = end_dim - assert(start_dim == tot_dim) - - def set_vectors(self, stoi, vectors, dim, unk_init=torch.Tensor.zero_): - """ - Set the vectors for the Vocab instance from a collection of Tensors. - - Args: - stoi: A dictionary of string to the index of the associated vector - in the `vectors` input argument. - vectors: An indexed iterable (or other structure supporting __getitem__) that - given an input index, returns a FloatTensor representing the vector - for the token associated with the index. For example, - vector[stoi["string"]] should return the vector for "string". - dim: The dimensionality of the vectors. - unk_init (callback): by default, initialize out-of-vocabulary word vectors - to zero vectors; can be any function that takes in a Tensor and - returns a Tensor of the same size. Default: 'torch.zeros' - """ - self.vectors = torch.Tensor(len(self), dim) - for i, token in enumerate(self.itos): - wv_index = stoi.get(token, None) - if wv_index is not None: - self.vectors[i] = vectors[wv_index] - else: - self.vectors[i] = unk_init(self.vectors[i]) - - -class SubwordVocab(Vocab): - - def __init__(self, counter, max_size=None, specials=('<pad>'), - vectors=None, unk_init=torch.Tensor.zero_): - """Create a revtok subword vocabulary from a collections.Counter. - - Args: - counter: collections.Counter object holding the frequencies of - each word found in the data. - max_size: The maximum size of the subword vocabulary, or None for no - maximum. Default: None. - specials: The list of special tokens (e.g., padding or eos) that - will be prepended to the vocabulary in addition to an <unk> - token.
- vectors: One of either the available pretrained vectors - or custom pretrained vectors (see Vocab.load_vectors); - or a list of aforementioned vectors - unk_init (callback): by default, initialize out-of-vocabulary word vectors - to zero vectors; can be any function that takes in a Tensor and - returns a Tensor of the same size. Default: 'torch.zeros - """ - try: - import revtok - except ImportError: - print("Please install revtok.") - raise - - # Hardcode unk_index as subword_vocab has no specials_first argument - self.unk_index = (specials.index(SubwordVocab.UNK) - if SubwordVocab.UNK in specials else None) - - if self.unk_index is None: - self.stoi = defaultdict() - else: - self.stoi = defaultdict(self._default_unk_index) - - self.stoi.update({tok: i for i, tok in enumerate(specials)}) - self.itos = specials.copy() - - self.segment = revtok.SubwordSegmenter(counter, max_size) - - max_size = None if max_size is None else max_size + len(self.itos) - - # sort by frequency/entropy, then alphabetically - toks = sorted(self.segment.vocab.items(), - key=lambda tup: (len(tup[0]) != 1, -tup[1], tup[0])) - - for tok, _ in toks: - if len(self.itos) == max_size: - break - self.itos.append(tok) - self.stoi[tok] = len(self.itos) - 1 - - if vectors is not None: - self.load_vectors(vectors, unk_init=unk_init) - - -def build_vocab_from_iterator(iterator, num_lines=None): - """ - Build a Vocab from an iterator. - - Args: - iterator: Iterator used to build Vocab. Must yield list or iterator of tokens. - num_lines: The expected number of elements returned by the iterator. - (Default: None) - Optionally, if known, the expected number of elements can be passed to - this factory function for improved progress reporting. - """ - - counter = Counter() - with tqdm(unit_scale=0, unit='lines', total=num_lines) as t: - for tokens in iterator: - counter.update(tokens) - t.update(1) - word_vocab = Vocab(counter) - return word_vocab diff --git a/torchtext/transforms.py b/torchtext/transforms.py index 436c083754..cf43e40f4d 100644 --- a/torchtext/transforms.py +++ b/torchtext/transforms.py @@ -130,7 +130,7 @@ def __init__( if sort_names: label_names = sorted(label_names) - self._label_vocab = Vocab(torch.classes.torchtext.Vocab(label_names, None)) + self._label_vocab = Vocab(torch.classes.torchtext.Vocab(label_names, 0)) self._label_names = self._label_vocab.get_itos() def forward(self, labels: Union[str, List[str]]) -> Union[int, List[int]]:
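For callers that still import the removed torchtext.legacy.vocab.Vocab, the snippet below is a minimal migration sketch built on the non-legacy torchtext.vocab factories that remain after this patch (vocab and build_vocab_from_iterator). It is illustrative only and not part of the diff: the toy token lists and the <unk>/<pad> specials are placeholders, and the keyword arguments assume the torchtext 0.11-style signatures, so verify them against the installed release.

# Minimal migration sketch (assumed torchtext >= 0.11 API); placeholder data.
from collections import Counter, OrderedDict
from torchtext.vocab import vocab, build_vocab_from_iterator

# Replaces legacy build_vocab_from_iterator: build from an iterator of token lists.
v = build_vocab_from_iterator(
    [["hello", "world"], ["hello", "there"]],  # placeholder corpus
    specials=["<unk>", "<pad>"],
)
v.set_default_index(v["<unk>"])  # plays the role of the legacy unk_index
print(v.lookup_indices(["hello", "unseen"]))  # the unseen token maps to <unk>

# Replaces legacy Vocab(counter): build from an OrderedDict of token frequencies.
counter = Counter(["hello", "world", "hello"])
ordered = OrderedDict(sorted(counter.items(), key=lambda x: x[1], reverse=True))
v2 = vocab(ordered)
v2.insert_token("<unk>", 0)  # put the unknown token at index 0
v2.set_default_index(0)

Both objects expose get_itos() and get_stoi() for the lookups that legacy code performed through the itos and stoi attributes.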