Skip to content

Commit

Permalink
Import torchtext #1291 0790ce6
Browse files Browse the repository at this point in the history
Reviewed By: parmeet

Differential Revision: D28101664

fbshipit-source-id: a8643b3ecf85de2cb815dcfa5789a4a5d246d80f
cpuhrsch authored and facebook-github-bot committed Apr 29, 2021

Verified

This commit was signed with the committer’s verified signature.
neo1973 Markus Härer
1 parent dac4b9c commit b9a38f2
Showing 2 changed files with 42 additions and 3 deletions.
43 changes: 41 additions & 2 deletions benchmark/benchmark_experimental_vocab.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import argparse
from collections import (Counter, OrderedDict)
import time

import random
import string
from timeit import default_timer as timer
from matplotlib import pyplot as plt
import torch
from torchtext.experimental.datasets import DATASETS
from torchtext.experimental.vocab import (
@@ -16,6 +19,42 @@
from torchtext.experimental.transforms import basic_english_normalize


def compare_legacy_and_experimental_batch_lookup(num_tokens=1000, num_letters=6,
                                                 num_lines=100000):
    r"""Benchmark batch index lookup of the legacy ``Vocab`` against the
    experimental vocab and plot the resulting speed-up curve.

    For each average line length in ``range(2, 100)``, ``num_lines`` random
    lines are sampled from a synthetic vocabulary of ``num_tokens`` tokens,
    then looked up with both implementations via ``lookup_indices``. The
    per-length speed-up (legacy time / experimental time) is printed and the
    curve is saved to ``speedup.jpg`` in the working directory.

    Args:
        num_tokens: size of the synthetic vocabulary (default: 1000).
        num_letters: length of each random token (default: 6).
        num_lines: number of lines sampled per token length (default: 100000).
    """
    # Build a synthetic vocabulary of random fixed-length tokens.
    vocab = [''.join(random.sample(string.ascii_letters * num_letters, num_letters))
             for _ in range(num_tokens)]
    counter = Counter()
    counter.update(vocab)
    legacy_vocab = Vocab(counter)
    experimental_vocab = VocabExperimental(counter)

    speed_ups = []
    token_lengths = list(range(2, 100))
    for tokens_per_line in token_lengths:
        lines = [random.sample(vocab, tokens_per_line) for _ in range(num_lines)]

        start_time = timer()
        for text in lines:
            legacy_vocab.lookup_indices(text)
        legacy_time = timer() - start_time

        start_time = timer()
        for text in lines:
            experimental_vocab.lookup_indices(text)
        experimental_time = timer() - start_time

        speed_ups.append(legacy_time / experimental_time)
        print("speed-up={} for average length={}".format(
            legacy_time / experimental_time, tokens_per_line))
        # Release the sampled lines before allocating the next (larger) batch
        # to keep peak memory bounded.
        del lines

    plt.close()
    fig, ax = plt.subplots(1, 1)
    ax.plot(token_lengths, speed_ups)
    ax.set_xlabel('Average Tokens per line')
    ax.set_ylabel('Speed-up')
    plt.savefig("speedup.jpg")


def legacy_vocab_from_file_object(file_like_object, **kwargs):
r"""Create a `Vocab` object from a file like object.
@@ -76,7 +115,7 @@ def benchmark_experimental_vocab_construction(vocab_file_path, is_raw_text=True,
print("Construction time:", time.monotonic() - t0)


def benchmark_experimental_vocab_lookup(vocab_file_path=None, dataset = 'AG_NEWS'):
def benchmark_experimental_vocab_lookup(vocab_file_path=None, dataset='AG_NEWS'):
def _run_benchmark_lookup(tokens, vocab):
t0 = time.monotonic()
# list lookup
2 changes: 1 addition & 1 deletion version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.10.0a0
0.10.0a0

0 comments on commit b9a38f2

Please sign in to comment.