Function to compare batch look-up for vocab #1290

Closed · wants to merge 4 commits
Changes from all commits
43 changes: 41 additions & 2 deletions benchmark/benchmark_experimental_vocab.py
@@ -1,7 +1,10 @@
import argparse
from collections import (Counter, OrderedDict)
import time

from timeit import default_timer as timer
import random
import string
from matplotlib import pyplot as plt
import torch
from torchtext.experimental.datasets import DATASETS
from torchtext.experimental.vocab import (
@@ -16,6 +19,42 @@
from torchtext.experimental.transforms import basic_english_normalize


def compare_legacy_and_experimental_batch_lookup():
    num_tokens = 1000
    num_letters = 6
    num_lines = 100000
    vocab = [''.join(random.sample(string.ascii_letters * num_letters, num_letters)) for _ in range(num_tokens)]
    counter = Counter()
    counter.update(vocab)
    legacy_vocab = Vocab(counter)
    experimental_vocab = VocabExperimental(counter)
    speed_ups = []
    token_lengths = [i for i in range(2, 100)]
    for i in token_lengths:
        lines = [random.sample(vocab, i) for _ in range(num_lines)]
        start_time = timer()
        for line in lines:
            legacy_vocab.lookup_indices(line)
        legacy_time = timer() - start_time

        start_time = timer()
        for line in lines:
            experimental_vocab.lookup_indices(line)
        experimental_time = timer() - start_time

        speed_ups.append(legacy_time / experimental_time)
        print("speed-up={} for average length={}".format(legacy_time / experimental_time, i))
        del lines

    plt.close()
    fig, ax = plt.subplots(1, 1)
    ax.plot(token_lengths, speed_ups)
    ax.set_xlabel('Average Tokens per line')
    ax.set_ylabel('Speed-up')
    plt.savefig("speedup.jpg")


def legacy_vocab_from_file_object(file_like_object, **kwargs):
    r"""Create a `Vocab` object from a file-like object.

@@ -76,7 +115,7 @@ def benchmark_experimental_vocab_construction(vocab_file_path, is_raw_text=True,
print("Construction time:", time.monotonic() - t0)


def benchmark_experimental_vocab_lookup(vocab_file_path=None, dataset = 'AG_NEWS'):
def benchmark_experimental_vocab_lookup(vocab_file_path=None, dataset='AG_NEWS'):
def _run_benchmark_lookup(tokens, vocab):
t0 = time.monotonic()
# list lookup
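
For reference, a minimal driver for the new comparison could look like the sketch below. It is not part of this PR; it assumes the script's existing imports are in place and simply calls the new benchmark (note the full sweep runs 100,000 lines for each token length from 2 to 99, so it takes a while):

    # Hypothetical usage sketch; not part of the diff above.
    if __name__ == "__main__":
        compare_legacy_and_experimental_batch_lookup()
        # Prints one speed-up figure per token length and saves the
        # legacy-vs-experimental lookup plot to speedup.jpg.
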
2 changes: 1 addition & 1 deletion test/experimental/test_vocab.py
@@ -219,7 +219,7 @@ def test_vocab_load_and_save(self):

    def test_build_vocab_iterator(self):
        iterator = [['hello', 'hello', 'hello', 'freq_low', 'hello', 'world', 'world', 'world', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T',
                     'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']]
        v = build_vocab_from_iterator(iterator)
        expected_itos = ['<unk>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low']
        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
54 changes: 25 additions & 29 deletions test/experimental/test_with_asset.py
@@ -78,13 +78,12 @@ class TestTransformsWithAsset(TorchtextTestCase):
    def test_vocab_transform(self):
        asset_name = 'vocab_test2.txt'
        asset_path = get_asset_path(asset_name)
-        with open(asset_path, 'r') as f:
-            vocab_transform = VocabTransform(load_vocab_from_file(f))
-            self.assertEqual(vocab_transform(['of', 'that', 'new']),
-                             [7, 18, 24])
-            jit_vocab_transform = torch.jit.script(vocab_transform)
-            self.assertEqual(jit_vocab_transform(['of', 'that', 'new', 'that']),
-                             [7, 18, 24, 18])
+        vocab_transform = VocabTransform(load_vocab_from_file(asset_path))
+        self.assertEqual(vocab_transform(['of', 'that', 'new']),
+                         [7, 18, 24])
+        jit_vocab_transform = torch.jit.script(vocab_transform)
+        self.assertEqual(jit_vocab_transform(['of', 'that', 'new', 'that']),
+                         [7, 18, 24, 18])

    def test_errors_vectors_python(self):
        tokens = []
@@ -179,27 +178,25 @@ def test_glove_different_dims(self):
    def test_vocab_from_file(self):
        asset_name = 'vocab_test.txt'
        asset_path = get_asset_path(asset_name)
-        with open(asset_path, 'r') as f:
-            v = load_vocab_from_file(f, unk_token='<new_unk>')
-            expected_itos = ['<new_unk>', 'b', 'a', 'c']
-            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
-            self.assertEqual(v.get_itos(), expected_itos)
-            self.assertEqual(dict(v.get_stoi()), expected_stoi)
+        v = load_vocab_from_file(asset_path, unk_token='<new_unk>')
+        expected_itos = ['<new_unk>', 'b', 'a', 'c']
+        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
+        self.assertEqual(v.get_itos(), expected_itos)
+        self.assertEqual(dict(v.get_stoi()), expected_stoi)

    def test_vocab_from_raw_text_file(self):
        asset_name = 'vocab_raw_text_test.txt'
        asset_path = get_asset_path(asset_name)
-        with open(asset_path, 'r') as f:
-            tokenizer = basic_english_normalize()
-            jit_tokenizer = torch.jit.script(tokenizer)
-            v = build_vocab_from_text_file(f, jit_tokenizer, unk_token='<new_unk>')
-            expected_itos = ['<new_unk>', "'", 'after', 'talks', '.', 'are', 'at', 'disappointed',
-                             'fears', 'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent',
-                             'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner',
-                             'unions', 'with', 'workers']
-            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
-            self.assertEqual(v.get_itos(), expected_itos)
-            self.assertEqual(dict(v.get_stoi()), expected_stoi)
+        tokenizer = basic_english_normalize()
+        jit_tokenizer = torch.jit.script(tokenizer)
+        v = build_vocab_from_text_file(asset_path, jit_tokenizer, unk_token='<new_unk>')
+        expected_itos = ['<new_unk>', "'", 'after', 'talks', '.', 'are', 'at', 'disappointed',
+                         'fears', 'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent',
+                         'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner',
+                         'unions', 'with', 'workers']
+        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
+        self.assertEqual(v.get_itos(), expected_itos)
+        self.assertEqual(dict(v.get_stoi()), expected_stoi)

    def test_builtin_pretrained_sentencepiece_processor(self):
        sp_model_path = download_from_url(PRETRAINED_SP_MODEL['text_unigram_25000'])
@@ -241,11 +238,10 @@ def batch_func(data):
    def test_text_sequential_transform(self):
        asset_name = 'vocab_test2.txt'
        asset_path = get_asset_path(asset_name)
-        with open(asset_path, 'r') as f:
-            pipeline = TextSequentialTransforms(basic_english_normalize(), load_vocab_from_file(f))
-            jit_pipeline = torch.jit.script(pipeline)
-            self.assertEqual(pipeline('of that new'), [7, 18, 24])
-            self.assertEqual(jit_pipeline('of that new'), [7, 18, 24])
+        pipeline = TextSequentialTransforms(basic_english_normalize(), load_vocab_from_file(asset_path))
+        jit_pipeline = torch.jit.script(pipeline)
+        self.assertEqual(pipeline('of that new'), [7, 18, 24])
+        self.assertEqual(jit_pipeline('of that new'), [7, 18, 24])

    def test_vectors_from_file(self):
        asset_name = 'vectors_test.csv'
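
The recurring change in this test file: the experimental vocab factories now take a file path rather than an open file object, so the `with open(...)` wrappers disappear. A before/after sketch of the calling pattern (`asset_path` stands in for any vocab file path):

    # Before this PR: the factory consumed an open file object.
    # with open(asset_path, 'r') as f:
    #     v = load_vocab_from_file(f)

    # After this PR: the factory takes the path directly and opens the
    # file internally (reading with num_cpus workers, per the signature).
    v = load_vocab_from_file(asset_path)
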
24 changes: 10 additions & 14 deletions torchtext/experimental/vocab.py
@@ -19,12 +19,11 @@
logger = logging.getLogger(__name__)


-def build_vocab_from_text_file(file_object, jited_tokenizer, min_freq=1, unk_token='<unk>', num_cpus=4):
+def build_vocab_from_text_file(file_path, jited_tokenizer, min_freq=1, unk_token='<unk>', num_cpus=4):
r"""Create a `Vocab` object from a raw text file.

The `file_object` can contain any raw text. This function applies a generic JITed tokenizer in
parallel to the text. Note that the vocab will be created in the order that the tokens first appear
in the file (and not by the frequency of tokens).
The `file_path` can contain any raw text. This function applies a generic JITed tokenizer in
parallel to the text.

    Args:
        file_path (str): path to the raw text file to read data from.
@@ -40,20 +39,18 @@
    Examples:
        >>> from torchtext.experimental.vocab import build_vocab_from_text_file
        >>> from torchtext.experimental.transforms import basic_english_normalize
-        >>> f = open('vocab.txt', 'r')
        >>> tokenizer = basic_english_normalize()
        >>> jit_tokenizer = torch.jit.script(tokenizer)
-        >>> v = build_vocab_from_text_file(f, jit_tokenizer)
+        >>> v = build_vocab_from_text_file('vocab.txt', jit_tokenizer)
    """
-    vocab_obj = _build_vocab_from_text_file(file_object.name, unk_token, min_freq, num_cpus, jited_tokenizer)
+    vocab_obj = _build_vocab_from_text_file(file_path, unk_token, min_freq, num_cpus, jited_tokenizer)
    return Vocab(vocab_obj)


-def load_vocab_from_file(file_object, min_freq=1, unk_token='<unk>', num_cpus=4):
+def load_vocab_from_file(file_path, min_freq=1, unk_token='<unk>', num_cpus=4):
r"""Create a `Vocab` object from a text file.
The `file_object` should contain tokens separated by new lines. Note that the vocab
will be created in the order that the tokens first appear in the file (and not by the frequency of tokens).
The `file_path` should contain tokens separated by new lines.
Format for txt file:

        token1
@@ -73,11 +70,10 @@ def load_vocab_from_file(file_object, min_freq=1, unk_token='<unk>', num_cpus=4)

    Examples:
        >>> from torchtext.experimental.vocab import load_vocab_from_file
-        >>> f = open('vocab.txt', 'r')
-        >>> v = load_vocab_from_file(f)
+        >>> v = load_vocab_from_file('vocab.txt')
    """

-    vocab_obj = _load_vocab_from_file(file_object.name, unk_token, min_freq, num_cpus)
+    vocab_obj = _load_vocab_from_file(file_path, unk_token, min_freq, num_cpus)
    return Vocab(vocab_obj)


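Taken together, the vocab.py changes make both factories path-based. A short usage sketch consistent with the docstrings above (file names are hypothetical; 'vocab.txt' holds one token per line, 'raw.txt' arbitrary text):

    import torch
    from torchtext.experimental.transforms import basic_english_normalize
    from torchtext.experimental.vocab import (
        build_vocab_from_text_file,
        load_vocab_from_file,
    )

    # One token per line -> Vocab; tokens below min_freq are dropped.
    v = load_vocab_from_file('vocab.txt', min_freq=1, unk_token='<unk>')

    # Raw text -> Vocab; a JITed tokenizer is applied in parallel.
    jit_tokenizer = torch.jit.script(basic_english_normalize())
    v_raw = build_vocab_from_text_file('raw.txt', jit_tokenizer, num_cpus=4)

    # Batch look-up, the operation this PR benchmarks.
    indices = v.lookup_indices(['token1', 'token2'])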