Skip to content

Commit

Permalink
Import torchtext #1437 2cebac3
Browse files Browse the repository at this point in the history
Summary: Imports [#1437](#1437) from OSS Torchtext that removes the legacy folder.

Reviewed By: parmeet

Differential Revision: D32923084

fbshipit-source-id: 83411efd62cd527c518e36279bdbf586435ac9e5
  • Loading branch information
abhinavarora authored and facebook-github-bot committed Dec 8, 2021
1 parent 8d19c03 commit f898310
Show file tree
Hide file tree
Showing 47 changed files with 60 additions and 7,200 deletions.
134 changes: 13 additions & 121 deletions benchmark/benchmark_vocab.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import argparse
from collections import (Counter, OrderedDict)
import time
import random
import string
from timeit import default_timer as timer
from matplotlib import pyplot as plt
import torch
from torchtext.datasets import DATASETS
from torchtext.experimental.vocab_factory import (
Expand All @@ -13,15 +9,12 @@
)
from torchtext.vocab import build_vocab_from_iterator
from torchtext.vocab import vocab as VocabNew
from torchtext.legacy.vocab import (
Vocab,
build_vocab_from_iterator as build_vocab_from_iterator_legacy,
)
from torchtext.experimental.transforms import(
from torchtext.experimental.transforms import (
basic_english_normalize,
)
from torchtext.data.utils import get_tokenizer


def build_vocab(data, transforms):
def apply_transforms(data):
for _, line in data:
Expand All @@ -31,96 +24,16 @@ def apply_transforms(data):
return vocab


def compare_legacy_and_new_batch_lookup():
    """Compare batched ``lookup_indices`` speed of the legacy and new vocabs.

    A pool of 1000 random six-letter tokens is generated and used to build
    both a legacy ``Vocab`` and a ``VocabNew`` from the same ``Counter``.
    For every line length from 2 to 99, 100000 lines are sampled from the
    pool and looked up line by line with each vocab. The ratio
    ``legacy_time / new_time`` is printed per length, and the ratios are
    plotted against the line length and saved to ``speedup.jpg``.
    """
    num_tokens = 1000
    num_letters = 6
    num_lines = 100000

    # Population the random tokens are drawn from (letters may repeat
    # because the alphabet is duplicated num_letters times).
    alphabet = string.ascii_letters * num_letters
    vocab = [''.join(random.sample(alphabet, num_letters)) for _ in range(num_tokens)]

    counter = Counter()
    counter.update(vocab)
    legacy_vocab = Vocab(counter)
    new_vocab = VocabNew(counter)

    speed_ups = []
    token_lengths = list(range(2, 100))
    for length in token_lengths:
        batch = [random.sample(vocab, length) for _ in range(num_lines)]

        legacy_start = timer()
        for tokens in batch:
            legacy_vocab.lookup_indices(tokens)
        legacy_time = timer() - legacy_start

        new_start = timer()
        for tokens in batch:
            new_vocab.lookup_indices(tokens)
        new_time = timer() - new_start

        ratio = legacy_time / new_time
        speed_ups.append(ratio)
        print("speed-up={} for average length={}".format(ratio, length))
        # Drop the sampled lines before the next (larger) batch is built.
        del batch

    plt.close()
    _, axes = plt.subplots(1, 1)
    axes.plot(token_lengths, speed_ups)
    axes.set_xlabel('Average Tokens per line')
    axes.set_ylabel('Speed-up')
    plt.savefig("speedup.jpg")


def legacy_vocab_from_file_object(file_like_object, **kwargs):
    """Build a legacy vocab from the lines of a file-like object.

    Each line read from ``file_like_object`` is split into tokens with the
    ``basic_english_normalize`` tokenizer, and the resulting token stream
    is handed to ``build_vocab_from_iterator_legacy``. Tokens are expected
    to be separated by new lines, e.g.::

        token1
        token2
        ...
        token_n

    Args:
        file_like_object: an iterable of text lines, such as an open text
            file.
        **kwargs: accepted by the signature but not used; nothing is
            forwarded to the vocab builder.

    Returns:
        The object returned by ``build_vocab_from_iterator_legacy``.

    Examples:
        >>> f = open('vocab.txt', 'r')
        >>> v = legacy_vocab_from_file_object(f)
    """
    normalize = basic_english_normalize()

    def iter_tokens(lines):
        # Flatten the per-line token lists into a single token stream.
        for line in lines:
            yield from normalize(line)

    return build_vocab_from_iterator_legacy(iter_tokens(file_like_object))


def benchmark_new_vocab_construction(vocab_file_path, is_raw_text=True, is_legacy=True, num_iters=1):
def benchmark_new_vocab_construction(vocab_file_path, is_raw_text=True, num_iters=1):
f = open(vocab_file_path, 'r')
t0 = time.monotonic()
if is_raw_text:
if is_legacy:
print("Loading from raw text file with legacy python function")
for _ in range(num_iters):
legacy_vocab_from_file_object(f)

print("Construction time:", time.monotonic() - t0)
else:
print("Loading from raw text file with basic_english_normalize tokenizer")
for _ in range(num_iters):
tokenizer = basic_english_normalize()
jited_tokenizer = torch.jit.script(tokenizer)
build_vocab_from_text_file(vocab_file_path, jited_tokenizer, num_cpus=1)
print("Construction time:", time.monotonic() - t0)
print("Loading from raw text file with basic_english_normalize tokenizer")
for _ in range(num_iters):
tokenizer = basic_english_normalize()
jited_tokenizer = torch.jit.script(tokenizer)
build_vocab_from_text_file(vocab_file_path, jited_tokenizer, num_cpus=1)
print("Construction time:", time.monotonic() - t0)
else:
for _ in range(num_iters):
load_vocab_from_file(f)
Expand All @@ -146,9 +59,9 @@ def _run_benchmark_lookup(tokens, vocab):
tokens_lists = []
tokenizer = get_tokenizer("basic_english")
for (_, text) in DATASETS[dataset](split='train'):
cur_tokens = tokenizer(text)
tokens_lists.append(cur_tokens)
tokens += cur_tokens
cur_tokens = tokenizer(text)
tokens_lists.append(cur_tokens)
tokens += cur_tokens

if vocab_file_path:
print("Loading Vocab from file {}".format(vocab_file_path))
Expand All @@ -158,12 +71,6 @@ def token_iterator(file_path):
for token in f:
yield token

# existing Vocab construction
print("Vocab")
t0 = time.monotonic()
v_existing = build_vocab_from_iterator_legacy(token_iterator(vocab_file_path))
print("Construction time:", time.monotonic() - t0)

# new Vocab construction
print("Vocab New")
t0 = time.monotonic()
Expand All @@ -176,25 +83,13 @@ def token_iterator(file_path):
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

# existing Vocab construction
print("Vocab")
t0 = time.monotonic()
v_existing = Vocab(counter)
print("Construction time:", time.monotonic() - t0)

# new Vocab construction
print("Vocab New")
t0 = time.monotonic()
v_new = VocabNew(ordered_dict)
print("Construction time:", time.monotonic() - t0)
jit_v_new = torch.jit.script(v_new)

# existing Vocab eager lookup
print("Vocab - Eager Mode")
_run_benchmark_lookup(tokens, v_existing)
_run_benchmark_lookup([tokens], v_existing)
_run_benchmark_lookup(tokens_lists, v_existing)

# new Vocab eager lookup
print("Vocab New - Eager Mode")
_run_benchmark_lookup(tokens, v_new)
Expand All @@ -215,8 +110,6 @@ def token_iterator(file_path):
help='run benchmark for constructing a vocab (default=False)')
parser.add_argument('--is-raw-text', type=bool, default=True,
help='construct vocab from raw text file (default=True)')
parser.add_argument('--is-legacy', type=bool, default=False,
help='construct vocab using legacy implementation (default=False)')
parser.add_argument('--vocab-filename-construction', type=str, default='vocab.txt',
help='The name of vocab file used for construction')
parser.add_argument('--vocab-filename-lookup', type=str, default=None,
Expand All @@ -226,8 +119,7 @@ def token_iterator(file_path):
args = parser.parse_args()

if args.run_construction_benchmark:
print("is_legacy", args.is_legacy)
benchmark_new_vocab_construction(args.vocab_filename_construction,
is_raw_text=args.is_raw_text, is_legacy=args.is_legacy)
is_raw_text=args.is_raw_text)
else:
benchmark_new_vocab_lookup(args.vocab_filename_lookup, args.dataset)
Loading

0 comments on commit f898310

Please sign in to comment.