This repository has been archived by the owner on Nov 22, 2022. It is now read-only.

Move stripped-down legacy torchtext to pytext
Summary: We recently deprecated the legacy folder in PyTorch Text OSS in pytorch/text#1437. However, some FB-specific code, especially in PyText, still depends on TorchText's legacy folder. This diff upstreams part of that legacy code into PyText so the legacy folder can be deleted from TorchText later.

Reviewed By: parmeet

Differential Revision: D32409029

fbshipit-source-id: c79fa986e2b1851c4fe6eea362cb7daac8549af6
abhinavarora authored and facebook-github-bot committed Nov 17, 2021
1 parent 3402f1f commit 079d274
Showing 20 changed files with 4,223 additions and 0 deletions.
11 changes: 11 additions & 0 deletions pytext/legacy/__init__.py
@@ -0,0 +1,11 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from torchtext import nn
from torchtext import utils

from . import data
from . import datasets
from . import vocab

__all__ = ["data", "nn", "datasets", "utils", "vocab"]
56 changes: 56 additions & 0 deletions pytext/legacy/data/__init__.py
@@ -0,0 +1,56 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from torchtext.data import functional

# These modules are not in the legacy folder; they are re-exported from core torchtext.
from torchtext.data import metrics
from torchtext.data import utils
from torchtext.data.functional import (
generate_sp_model,
load_sp_model,
sentencepiece_numericalizer,
sentencepiece_tokenizer,
custom_replace,
simple_space_split,
numericalize_tokens_from_iterator,
)
from torchtext.data.metrics import bleu_score
from torchtext.data.utils import get_tokenizer, interleave_keys

from .batch import Batch
from .dataset import Dataset, TabularDataset
from .example import Example
from .field import RawField, Field, NestedField, LabelField
from .iterator import batch, BucketIterator, Iterator, BPTTIterator, pool
from .pipeline import Pipeline

__all__ = [
"Batch",
"Example",
"RawField",
"Field",
"NestedField",
"LabelField",
"batch",
"BucketIterator",
"Iterator",
"BPTTIterator",
"pool",
"Pipeline",
"Dataset",
"TabularDataset",
"metrics",
"bleu_score",
"utils",
"get_tokenizer",
"interleave_keys",
"functional",
"generate_sp_model",
"load_sp_model",
"sentencepiece_numericalizer",
"sentencepiece_tokenizer",
"custom_replace",
"simple_space_split",
"numericalize_tokens_from_iterator",
]
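
Since metrics, utils, and functional resolve to core torchtext, the familiar helpers keep working through the vendored path. A small hedged example (the inputs are illustrative, not from this diff):

    from pytext.legacy.data import bleu_score, get_tokenizer, simple_space_split

    tokenize = get_tokenizer("basic_english")
    candidate = [tokenize("the cat sat on the mat")]
    references = [[tokenize("the cat sat on the mat")]]
    print(bleu_score(candidate, references))  # 1.0 for an exact match

    # simple_space_split consumes an iterator of raw strings
    print(list(simple_space_split(["hello world"])))  # [['hello', 'world']]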
114 changes: 114 additions & 0 deletions pytext/legacy/data/batch.py
@@ -0,0 +1,114 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import torch


class Batch(object):
"""Defines a batch of examples along with its Fields.
Attributes:
batch_size: Number of examples in the batch.
dataset: A reference to the dataset object the examples come from
(which itself contains the dataset's Field objects).
train: Deprecated: this attribute is left for backwards compatibility,
however it is UNUSED as of the merger with pytorch 0.4.
input_fields: The names of the fields that are used as input for the model
target_fields: The names of the fields that are used as targets during
model training
Also stores the Variable for each column in the batch as an attribute.
"""

def __init__(self, data=None, dataset=None, device=None):
"""Create a Batch from a list of examples."""
if data is not None:
self.batch_size = len(data)
self.dataset = dataset
            self.fields = dataset.fields.keys()  # field names (a dict view, not a copy)
self.input_fields = [
k
for k, v in dataset.fields.items()
if v is not None and not v.is_target
]
self.target_fields = [
k for k, v in dataset.fields.items() if v is not None and v.is_target
]

for (name, field) in dataset.fields.items():
if field is not None:
batch = [getattr(x, name) for x in data]
setattr(self, name, field.process(batch, device=device))

@classmethod
def fromvars(cls, dataset, batch_size, train=None, **kwargs):
"""Create a Batch directly from a number of Variables."""
batch = cls()
batch.batch_size = batch_size
batch.dataset = dataset
batch.fields = dataset.fields.keys()
for k, v in kwargs.items():
setattr(batch, k, v)
return batch

def __repr__(self):
return str(self)

def __str__(self):
if not self.__dict__:
return "Empty {} instance".format(torch.typename(self))

fields_to_index = filter(lambda field: field is not None, self.fields)
var_strs = "\n".join(
[
"\t[." + name + "]" + ":" + _short_str(getattr(self, name))
for name in fields_to_index
if hasattr(self, name)
]
)

data_str = (
" from {}".format(self.dataset.name.upper())
if hasattr(self.dataset, "name") and isinstance(self.dataset.name, str)
else ""
)

strt = "[{} of size {}{}]\n{}".format(
torch.typename(self), self.batch_size, data_str, var_strs
)
return "\n" + strt

def __len__(self):
return self.batch_size

def _get_field_values(self, fields):
if len(fields) == 0:
return None
elif len(fields) == 1:
return getattr(self, fields[0])
else:
return tuple(getattr(self, f) for f in fields)

def __iter__(self):
yield self._get_field_values(self.input_fields)
yield self._get_field_values(self.target_fields)


def _short_str(tensor):
# unwrap variable to tensor
if not torch.is_tensor(tensor):
# (1) unpack variable
if hasattr(tensor, "data"):
tensor = tensor.data
# (2) handle include_lengths
elif isinstance(tensor, tuple):
return str(tuple(_short_str(t) for t in tensor))
# (3) fallback to default str
else:
return str(tensor)

# copied from torch _tensor_str
size_str = "x".join(str(size) for size in tensor.size())
device_str = "" if not tensor.is_cuda else " (GPU {})".format(tensor.get_device())
strt = "[{} of size {}{}]".format(torch.typename(tensor), size_str, device_str)
return strt
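
To illustrate how Batch ties the pieces of this diff together, here is a hedged end-to-end sketch; the TEXT/LABEL fields and the two examples are hypothetical, and it assumes the vendored Field, Example, and Dataset modules added elsewhere in this commit:

    # Sketch only: builds a tiny dataset, then a Batch from it.
    from pytext.legacy.data import Batch, Dataset, Example, Field, LabelField

    TEXT = Field(sequential=True, lower=True)
    LABEL = LabelField()  # LabelField is non-sequential and sets is_target=True

    fields = [("text", TEXT), ("label", LABEL)]
    examples = [
        Example.fromlist(["hello world", "greeting"], fields),
        Example.fromlist(["good bye", "farewell"], fields),
    ]
    dataset = Dataset(examples, fields)
    TEXT.build_vocab(dataset)
    LABEL.build_vocab(dataset)

    batch = Batch(data=examples, dataset=dataset, device="cpu")
    print(batch)         # __str__ renders each field via _short_str
    text, label = batch  # __iter__ yields input fields, then target fields

Batch.fromvars covers the reverse direction: it attaches already-built tensors to a fresh Batch, e.g. Batch.fromvars(dataset, 2, text=text, label=label).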
