Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixing benchmark code #1339

Merged
merged 1 commit into from
Jun 21, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,40 @@
from timeit import default_timer as timer
from matplotlib import pyplot as plt
import torch
from torchtext.experimental.datasets import DATASETS
from torchtext.datasets import DATASETS
from torchtext.experimental.vocab_factory import (
load_vocab_from_file,
build_vocab_from_text_file
)
from torchtext.vocab import vocab as VocabExperimental
from torchtext.vocab import build_vocab_from_iterator
from torchtext.vocab import vocab as VocabNew
from torchtext.legacy.vocab import (
Vocab,
build_vocab_from_iterator
build_vocab_from_iterator as build_vocab_from_iterator_legacy,
)
from torchtext.experimental.transforms import basic_english_normalize
from torchtext.experimental.transforms import(
basic_english_normalize,
)
from torchtext.data.utils import get_tokenizer

def build_vocab(data, transforms):
    """Construct a vocabulary from a dataset of (label, text) pairs.

    Each raw text line is passed through *transforms* (a tokenizer-like
    callable returning a list of tokens); the resulting token lists feed
    the vocab builder. '<unk>' and '<pad>' are reserved as specials, and
    '<unk>' is installed as the fallback for out-of-vocabulary lookups.
    """
    # Lazily tokenize each line; the label component of the pair is ignored.
    token_stream = (transforms(text) for _, text in data)
    vocab = build_vocab_from_iterator(token_stream, specials=['<unk>', '<pad>'])
    vocab.set_default_index(vocab['<unk>'])
    return vocab


def compare_legacy_and_experimental_batch_lookup():
def compare_legacy_and_new_batch_lookup():
num_tokens = 1000
num_letters = 6
num_lines = 100000
vocab = [''.join(random.sample(string.ascii_letters * num_letters, num_letters)) for _ in range(num_tokens)]
counter = Counter()
counter.update(vocab)
legacy_vocab = Vocab(counter)
experimental_vocab = VocabExperimental(counter)
new_vocab = VocabNew(counter)
speed_ups = []
token_lengths = [i for i in range(2, 100)]
for i in token_lengths:
Expand All @@ -39,12 +51,12 @@ def compare_legacy_and_experimental_batch_lookup():

start_time = timer()
for text in lines:
experimental_vocab.lookup_indices(text)
new_vocab.lookup_indices(text)

experimental_time = timer() - start_time
new_time = timer() - start_time

speed_ups.append(legacy_time / experimental_time)
print("speed-up={} for average length={}".format(legacy_time / experimental_time, i))
speed_ups.append(legacy_time / new_time)
print("speed-up={} for average length={}".format(legacy_time / new_time, i))
del lines

plt.close()
Expand Down Expand Up @@ -89,10 +101,10 @@ def token_iterator(lines):
for token in tokenize(line):
yield token

return build_vocab_from_iterator(token_iterator(file_like_object))
return build_vocab_from_iterator_legacy(token_iterator(file_like_object))


def benchmark_experimental_vocab_construction(vocab_file_path, is_raw_text=True, is_legacy=True, num_iters=1):
def benchmark_new_vocab_construction(vocab_file_path, is_raw_text=True, is_legacy=True, num_iters=1):
f = open(vocab_file_path, 'r')
t0 = time.monotonic()
if is_raw_text:
Expand All @@ -107,15 +119,15 @@ def benchmark_experimental_vocab_construction(vocab_file_path, is_raw_text=True,
for _ in range(num_iters):
tokenizer = basic_english_normalize()
jited_tokenizer = torch.jit.script(tokenizer)
build_vocab_from_text_file(f, jited_tokenizer, num_cpus=1)
build_vocab_from_text_file(vocab_file_path, jited_tokenizer, num_cpus=1)
print("Construction time:", time.monotonic() - t0)
else:
for _ in range(num_iters):
load_vocab_from_file(f)
print("Construction time:", time.monotonic() - t0)


def benchmark_experimental_vocab_lookup(vocab_file_path=None, dataset='AG_NEWS'):
def benchmark_new_vocab_lookup(vocab_file_path=None, dataset='AG_NEWS'):
def _run_benchmark_lookup(tokens, vocab):
t0 = time.monotonic()
# list lookup
Expand All @@ -132,15 +144,11 @@ def _run_benchmark_lookup(tokens, vocab):

tokens = []
tokens_lists = []

train = DATASETS[dataset](split='train')
vocab = train.get_vocab()
for (_, text) in train:
cur_tokens = []
for id in text.tolist():
cur_tokens.append(vocab.itos[id])
tokens_lists.append(cur_tokens)
tokens += cur_tokens
tokenizer = get_tokenizer("basic_english")
for (_, text) in DATASETS[dataset](split='train'):
cur_tokens = tokenizer(text)
tokens_lists.append(cur_tokens)
tokens += cur_tokens

if vocab_file_path:
print("Loading Vocab from file {}".format(vocab_file_path))
Expand All @@ -153,14 +161,14 @@ def token_iterator(file_path):
# existing Vocab construction
print("Vocab")
t0 = time.monotonic()
v_existing = build_vocab_from_iterator(token_iterator(vocab_file_path))
v_existing = build_vocab_from_iterator_legacy(token_iterator(vocab_file_path))
print("Construction time:", time.monotonic() - t0)

# experimental Vocab construction
print("Vocab Experimental")
# new Vocab construction
print("Vocab New")
t0 = time.monotonic()
f = open(vocab_file_path, 'r')
v_experimental = load_vocab_from_file(f)
v_new = load_vocab_from_file(f)
print("Construction time:", time.monotonic() - t0)
else:
print("Loading Vocab from {}".format(dataset))
Expand All @@ -174,31 +182,31 @@ def token_iterator(file_path):
v_existing = Vocab(counter)
print("Construction time:", time.monotonic() - t0)

# experimental Vocab construction
print("Vocab Experimental")
# new Vocab construction
print("Vocab New")
t0 = time.monotonic()
v_experimental = VocabExperimental(ordered_dict)
v_new = VocabNew(ordered_dict)
print("Construction time:", time.monotonic() - t0)
jit_v_experimental = torch.jit.script(v_experimental)
jit_v_new = torch.jit.script(v_new)

# existing Vocab eager lookup
print("Vocab - Eager Mode")
_run_benchmark_lookup(tokens, v_existing)
_run_benchmark_lookup([tokens], v_existing)
_run_benchmark_lookup(tokens_lists, v_existing)

# experimental Vocab eager lookup
print("Vocab Experimental - Eager Mode")
_run_benchmark_lookup(tokens, v_experimental)
_run_benchmark_lookup([tokens], v_experimental)
_run_benchmark_lookup(tokens_lists, v_experimental)
# new Vocab eager lookup
print("Vocab New - Eager Mode")
_run_benchmark_lookup(tokens, v_new)
_run_benchmark_lookup([tokens], v_new)
_run_benchmark_lookup(tokens_lists, v_new)

jit_v_experimental = torch.jit.script(v_experimental)
# experimental Vocab jit lookup
print("Vocab Experimental - Jit Mode")
_run_benchmark_lookup(tokens, jit_v_experimental)
_run_benchmark_lookup([tokens], jit_v_experimental)
_run_benchmark_lookup(tokens_lists, jit_v_experimental)
jit_v_new = torch.jit.script(v_new)
# new Vocab jit lookup
print("Vocab New - Jit Mode")
_run_benchmark_lookup(tokens, jit_v_new)
_run_benchmark_lookup([tokens], jit_v_new)
_run_benchmark_lookup(tokens_lists, jit_v_new)


if __name__ == "__main__":
Expand All @@ -219,7 +227,7 @@ def token_iterator(file_path):

if args.run_construction_benchmark:
print("is_legacy", args.is_legacy)
benchmark_experimental_vocab_construction(args.vocab_filename_construction,
benchmark_new_vocab_construction(args.vocab_filename_construction,
is_raw_text=args.is_raw_text, is_legacy=args.is_legacy)
else:
benchmark_experimental_vocab_lookup(args.vocab_filename_lookup, args.dataset)
benchmark_new_vocab_lookup(args.vocab_filename_lookup, args.dataset)