This repository has been archived by the owner on Nov 22, 2022. It is now read-only.

Move stripped-down legacy torchtext to pytext
Summary: We recently deprecated the legacy folder in PyTorch Text OSS in pytorch/text#1437. However, some FB-specific code, especially in PyText, still depends on TorchText's legacy folder. This diff upstreams part of that legacy code into PyText so the legacy folder can be deleted from TorchText later.

Reviewed By: parmeet

Differential Revision: D32409029

fbshipit-source-id: c79fa986e2b1851c4fe6eea362cb7daac8549af6
abhinavarora authored and facebook-github-bot committed Nov 17, 2021
1 parent 3402f1f commit 079d274
Showing 20 changed files with 4,223 additions and 0 deletions.
11 changes: 11 additions & 0 deletions pytext/legacy/__init__.py
@@ -0,0 +1,11 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from torchtext import nn
from torchtext import utils

from . import data
from . import datasets
from . import vocab

__all__ = ["data", "nn", "datasets", "utils", "vocab"]
56 changes: 56 additions & 0 deletions pytext/legacy/data/__init__.py
@@ -0,0 +1,56 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from torchtext.data import functional

# These modules are not in the legacy folder; they are re-exported from core torchtext.
from torchtext.data import metrics
from torchtext.data import utils
from torchtext.data.functional import (
generate_sp_model,
load_sp_model,
sentencepiece_numericalizer,
sentencepiece_tokenizer,
custom_replace,
simple_space_split,
numericalize_tokens_from_iterator,
)
from torchtext.data.metrics import bleu_score
from torchtext.data.utils import get_tokenizer, interleave_keys

from .batch import Batch
from .dataset import Dataset, TabularDataset
from .example import Example
from .field import RawField, Field, NestedField, LabelField
from .iterator import batch, BucketIterator, Iterator, BPTTIterator, pool
from .pipeline import Pipeline

__all__ = [
"Batch",
"Example",
"RawField",
"Field",
"NestedField",
"LabelField",
"batch",
"BucketIterator",
"Iterator",
"BPTTIterator",
"pool",
"Pipeline",
"Dataset",
"TabularDataset",
"metrics",
"bleu_score",
"utils",
"get_tokenizer",
"interleave_keys",
"functional",
"generate_sp_model",
"load_sp_model",
"sentencepiece_numericalizer",
"sentencepiece_tokenizer",
"custom_replace",
"simple_space_split",
"numericalize_tokens_from_iterator",
]
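
Since metrics, utils, and functional resolve to core torchtext, the familiar helpers keep working through the vendored path. A small hedged example (the inputs are illustrative, not from this diff):

    from pytext.legacy.data import bleu_score, get_tokenizer, simple_space_split

    tokenize = get_tokenizer("basic_english")
    candidate = [tokenize("the cat sat on the mat")]
    references = [[tokenize("the cat sat on the mat")]]
    print(bleu_score(candidate, references))  # 1.0 for an exact match

    # simple_space_split consumes an iterator of raw strings
    print(list(simple_space_split(["hello world"])))  # [['hello', 'world']]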
114 changes: 114 additions & 0 deletions pytext/legacy/data/batch.py
@@ -0,0 +1,114 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import torch


class Batch(object):
"""Defines a batch of examples along with its Fields.
Attributes:
batch_size: Number of examples in the batch.
dataset: A reference to the dataset object the examples come from
(which itself contains the dataset's Field objects).
train: Deprecated: this attribute is left for backwards compatibility,
however it is UNUSED as of the merger with pytorch 0.4.
input_fields: The names of the fields that are used as input for the model
target_fields: The names of the fields that are used as targets during
model training
Also stores the Variable for each column in the batch as an attribute.
"""

def __init__(self, data=None, dataset=None, device=None):
"""Create a Batch from a list of examples."""
if data is not None:
self.batch_size = len(data)
self.dataset = dataset
            self.fields = dataset.fields.keys()  # field names (a dict view, not a copy)
self.input_fields = [
k
for k, v in dataset.fields.items()
if v is not None and not v.is_target
]
self.target_fields = [
k for k, v in dataset.fields.items() if v is not None and v.is_target
]

for (name, field) in dataset.fields.items():
if field is not None:
batch = [getattr(x, name) for x in data]
setattr(self, name, field.process(batch, device=device))

@classmethod
def fromvars(cls, dataset, batch_size, train=None, **kwargs):
"""Create a Batch directly from a number of Variables."""
batch = cls()
batch.batch_size = batch_size
batch.dataset = dataset
batch.fields = dataset.fields.keys()
for k, v in kwargs.items():
setattr(batch, k, v)
return batch

def __repr__(self):
return str(self)

def __str__(self):
if not self.__dict__:
return "Empty {} instance".format(torch.typename(self))

fields_to_index = filter(lambda field: field is not None, self.fields)
var_strs = "\n".join(
[
"\t[." + name + "]" + ":" + _short_str(getattr(self, name))
for name in fields_to_index
if hasattr(self, name)
]
)

data_str = (
" from {}".format(self.dataset.name.upper())
if hasattr(self.dataset, "name") and isinstance(self.dataset.name, str)
else ""
)

strt = "[{} of size {}{}]\n{}".format(
torch.typename(self), self.batch_size, data_str, var_strs
)
return "\n" + strt

def __len__(self):
return self.batch_size

def _get_field_values(self, fields):
if len(fields) == 0:
return None
elif len(fields) == 1:
return getattr(self, fields[0])
else:
return tuple(getattr(self, f) for f in fields)

def __iter__(self):
yield self._get_field_values(self.input_fields)
yield self._get_field_values(self.target_fields)


def _short_str(tensor):
# unwrap variable to tensor
if not torch.is_tensor(tensor):
# (1) unpack variable
if hasattr(tensor, "data"):
tensor = tensor.data
# (2) handle include_lengths
elif isinstance(tensor, tuple):
return str(tuple(_short_str(t) for t in tensor))
# (3) fallback to default str
else:
return str(tensor)

# copied from torch _tensor_str
size_str = "x".join(str(size) for size in tensor.size())
device_str = "" if not tensor.is_cuda else " (GPU {})".format(tensor.get_device())
strt = "[{} of size {}{}]".format(torch.typename(tensor), size_str, device_str)
return strt
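
To illustrate how Batch ties the pieces of this diff together, here is a hedged end-to-end sketch; the TEXT/LABEL fields and the two examples are hypothetical, and it assumes the vendored Field, Example, and Dataset modules added elsewhere in this commit:

    # Sketch only: builds a tiny dataset, then a Batch from it.
    from pytext.legacy.data import Batch, Dataset, Example, Field, LabelField

    TEXT = Field(sequential=True, lower=True)
    LABEL = LabelField()  # LabelField is non-sequential and sets is_target=True

    fields = [("text", TEXT), ("label", LABEL)]
    examples = [
        Example.fromlist(["hello world", "greeting"], fields),
        Example.fromlist(["good bye", "farewell"], fields),
    ]
    dataset = Dataset(examples, fields)
    TEXT.build_vocab(dataset)
    LABEL.build_vocab(dataset)

    batch = Batch(data=examples, dataset=dataset, device="cpu")
    print(batch)         # __str__ renders each field via _short_str
    text, label = batch  # __iter__ yields input fields, then target fields

Batch.fromvars covers the reverse direction: it attaches already-built tensors to a fresh Batch, e.g. Batch.fromvars(dataset, 2, text=text, label=label).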
