Function to compare batch look-up for vocab #1290

Closed · wants to merge 4 commits
Changes from all commits
43 changes: 41 additions & 2 deletions benchmark/benchmark_experimental_vocab.py
@@ -1,7 +1,10 @@
import argparse
from collections import (Counter, OrderedDict)
import time

from timeit import default_timer as timer
import random
import string
from matplotlib import pyplot as plt
import torch
from torchtext.experimental.datasets import DATASETS
from torchtext.experimental.vocab import (
@@ -16,6 +19,42 @@
from torchtext.experimental.transforms import basic_english_normalize


def compare_legacy_and_experimental_batch_lookup():
    num_tokens = 1000
    num_letters = 6
    num_lines = 100000
    vocab = [''.join(random.sample(string.ascii_letters * num_letters, num_letters)) for _ in range(num_tokens)]
    counter = Counter()
    counter.update(vocab)
    legacy_vocab = Vocab(counter)
    experimental_vocab = VocabExperimental(counter)
    speed_ups = []
    token_lengths = [i for i in range(2, 100)]
    for i in token_lengths:
        lines = [random.sample(vocab, i) for _ in range(num_lines)]
        start_time = timer()
        for line in lines:
            legacy_vocab.lookup_indices(line)
        legacy_time = timer() - start_time

        start_time = timer()
        for line in lines:
            experimental_vocab.lookup_indices(line)
        experimental_time = timer() - start_time

        speed_ups.append(legacy_time / experimental_time)
        print("speed-up={} for average length={}".format(legacy_time / experimental_time, i))
        del lines

    plt.close()
    fig, ax = plt.subplots(1, 1)
    ax.plot(token_lengths, speed_ups)
    ax.set_xlabel('Average Tokens per line')
    ax.set_ylabel('Speed-up')
    plt.savefig("speedup.jpg")


def legacy_vocab_from_file_object(file_like_object, **kwargs):
    r"""Create a `Vocab` object from a file-like object.

@@ -76,7 +115,7 @@ def benchmark_experimental_vocab_construction(vocab_file_path, is_raw_text=True,
print("Construction time:", time.monotonic() - t0)


def benchmark_experimental_vocab_lookup(vocab_file_path=None, dataset = 'AG_NEWS'):
def benchmark_experimental_vocab_lookup(vocab_file_path=None, dataset='AG_NEWS'):
def _run_benchmark_lookup(tokens, vocab):
t0 = time.monotonic()
# list lookup
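
For reference, a minimal driver for the new comparison could look like the sketch below. It is not part of this PR; it assumes the script's existing imports are in place and simply calls the new benchmark (note the full sweep runs 100,000 lines for each token length from 2 to 99, so it takes a while):

    # Hypothetical usage sketch; not part of the diff above.
    if __name__ == "__main__":
        compare_legacy_and_experimental_batch_lookup()
        # Prints one speed-up figure per token length and saves the
        # legacy-vs-experimental lookup plot to speedup.jpg.
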
2 changes: 1 addition & 1 deletion test/experimental/test_vocab.py
@@ -219,7 +219,7 @@ def test_vocab_load_and_save(self):

    def test_build_vocab_iterator(self):
        iterator = [['hello', 'hello', 'hello', 'freq_low', 'hello', 'world', 'world', 'world', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T',
                     'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']]
        v = build_vocab_from_iterator(iterator)
        expected_itos = ['<unk>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low']
        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
54 changes: 25 additions & 29 deletions test/experimental/test_with_asset.py
@@ -78,13 +78,12 @@ class TestTransformsWithAsset(TorchtextTestCase):
    def test_vocab_transform(self):
        asset_name = 'vocab_test2.txt'
        asset_path = get_asset_path(asset_name)
-        with open(asset_path, 'r') as f:
-            vocab_transform = VocabTransform(load_vocab_from_file(f))
-            self.assertEqual(vocab_transform(['of', 'that', 'new']),
-                             [7, 18, 24])
-            jit_vocab_transform = torch.jit.script(vocab_transform)
-            self.assertEqual(jit_vocab_transform(['of', 'that', 'new', 'that']),
-                             [7, 18, 24, 18])
+        vocab_transform = VocabTransform(load_vocab_from_file(asset_path))
+        self.assertEqual(vocab_transform(['of', 'that', 'new']),
+                         [7, 18, 24])
+        jit_vocab_transform = torch.jit.script(vocab_transform)
+        self.assertEqual(jit_vocab_transform(['of', 'that', 'new', 'that']),
+                         [7, 18, 24, 18])

    def test_errors_vectors_python(self):
        tokens = []
@@ -179,27 +178,25 @@ def test_glove_different_dims(self):
    def test_vocab_from_file(self):
        asset_name = 'vocab_test.txt'
        asset_path = get_asset_path(asset_name)
-        with open(asset_path, 'r') as f:
-            v = load_vocab_from_file(f, unk_token='<new_unk>')
-            expected_itos = ['<new_unk>', 'b', 'a', 'c']
-            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
-            self.assertEqual(v.get_itos(), expected_itos)
-            self.assertEqual(dict(v.get_stoi()), expected_stoi)
+        v = load_vocab_from_file(asset_path, unk_token='<new_unk>')
+        expected_itos = ['<new_unk>', 'b', 'a', 'c']
+        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
+        self.assertEqual(v.get_itos(), expected_itos)
+        self.assertEqual(dict(v.get_stoi()), expected_stoi)

    def test_vocab_from_raw_text_file(self):
        asset_name = 'vocab_raw_text_test.txt'
        asset_path = get_asset_path(asset_name)
-        with open(asset_path, 'r') as f:
-            tokenizer = basic_english_normalize()
-            jit_tokenizer = torch.jit.script(tokenizer)
-            v = build_vocab_from_text_file(f, jit_tokenizer, unk_token='<new_unk>')
-            expected_itos = ['<new_unk>', "'", 'after', 'talks', '.', 'are', 'at', 'disappointed',
-                             'fears', 'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent',
-                             'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner',
-                             'unions', 'with', 'workers']
-            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
-            self.assertEqual(v.get_itos(), expected_itos)
-            self.assertEqual(dict(v.get_stoi()), expected_stoi)
+        tokenizer = basic_english_normalize()
+        jit_tokenizer = torch.jit.script(tokenizer)
+        v = build_vocab_from_text_file(asset_path, jit_tokenizer, unk_token='<new_unk>')
+        expected_itos = ['<new_unk>', "'", 'after', 'talks', '.', 'are', 'at', 'disappointed',
+                         'fears', 'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent',
+                         'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner',
+                         'unions', 'with', 'workers']
+        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
+        self.assertEqual(v.get_itos(), expected_itos)
+        self.assertEqual(dict(v.get_stoi()), expected_stoi)

    def test_builtin_pretrained_sentencepiece_processor(self):
        sp_model_path = download_from_url(PRETRAINED_SP_MODEL['text_unigram_25000'])
@@ -241,11 +238,10 @@ def batch_func(data):
    def test_text_sequential_transform(self):
        asset_name = 'vocab_test2.txt'
        asset_path = get_asset_path(asset_name)
-        with open(asset_path, 'r') as f:
-            pipeline = TextSequentialTransforms(basic_english_normalize(), load_vocab_from_file(f))
-            jit_pipeline = torch.jit.script(pipeline)
-            self.assertEqual(pipeline('of that new'), [7, 18, 24])
-            self.assertEqual(jit_pipeline('of that new'), [7, 18, 24])
+        pipeline = TextSequentialTransforms(basic_english_normalize(), load_vocab_from_file(asset_path))
+        jit_pipeline = torch.jit.script(pipeline)
+        self.assertEqual(pipeline('of that new'), [7, 18, 24])
+        self.assertEqual(jit_pipeline('of that new'), [7, 18, 24])

    def test_vectors_from_file(self):
        asset_name = 'vectors_test.csv'
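
The recurring change in this test file: the experimental vocab factories now take a file path rather than an open file object, so the `with open(...)` wrappers disappear. A before/after sketch of the calling pattern (`asset_path` stands in for any vocab file path):

    # Before this PR: the factory consumed an open file object.
    # with open(asset_path, 'r') as f:
    #     v = load_vocab_from_file(f)

    # After this PR: the factory takes the path directly and opens the
    # file internally (reading with num_cpus workers, per the signature).
    v = load_vocab_from_file(asset_path)
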
24 changes: 10 additions & 14 deletions torchtext/experimental/vocab.py
@@ -19,12 +19,11 @@
logger = logging.getLogger(__name__)


-def build_vocab_from_text_file(file_object, jited_tokenizer, min_freq=1, unk_token='<unk>', num_cpus=4):
+def build_vocab_from_text_file(file_path, jited_tokenizer, min_freq=1, unk_token='<unk>', num_cpus=4):
r"""Create a `Vocab` object from a raw text file.

The `file_object` can contain any raw text. This function applies a generic JITed tokenizer in
parallel to the text. Note that the vocab will be created in the order that the tokens first appear
in the file (and not by the frequency of tokens).
The `file_path` can contain any raw text. This function applies a generic JITed tokenizer in
parallel to the text.

    Args:
        file_path (str): path to the raw text file to read data from.
@@ -40,20 +39,18 @@
    Examples:
        >>> from torchtext.experimental.vocab import build_vocab_from_text_file
        >>> from torchtext.experimental.transforms import basic_english_normalize
-        >>> f = open('vocab.txt', 'r')
        >>> tokenizer = basic_english_normalize()
        >>> jit_tokenizer = torch.jit.script(tokenizer)
-        >>> v = build_vocab_from_text_file(f, jit_tokenizer)
+        >>> v = build_vocab_from_text_file('vocab.txt', jit_tokenizer)
    """
-    vocab_obj = _build_vocab_from_text_file(file_object.name, unk_token, min_freq, num_cpus, jited_tokenizer)
+    vocab_obj = _build_vocab_from_text_file(file_path, unk_token, min_freq, num_cpus, jited_tokenizer)
    return Vocab(vocab_obj)


-def load_vocab_from_file(file_object, min_freq=1, unk_token='<unk>', num_cpus=4):
+def load_vocab_from_file(file_path, min_freq=1, unk_token='<unk>', num_cpus=4):
r"""Create a `Vocab` object from a text file.
The `file_object` should contain tokens separated by new lines. Note that the vocab
will be created in the order that the tokens first appear in the file (and not by the frequency of tokens).
The `file_path` should contain tokens separated by new lines.
Format for txt file:

        token1
@@ -73,11 +70,10 @@ def load_vocab_from_file(file_object, min_freq=1, unk_token='<unk>', num_cpus=4)

    Examples:
        >>> from torchtext.experimental.vocab import load_vocab_from_file
-        >>> f = open('vocab.txt', 'r')
-        >>> v = load_vocab_from_file(f)
+        >>> v = load_vocab_from_file('vocab.txt')
    """

-    vocab_obj = _load_vocab_from_file(file_object.name, unk_token, min_freq, num_cpus)
+    vocab_obj = _load_vocab_from_file(file_path, unk_token, min_freq, num_cpus)
    return Vocab(vocab_obj)


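Taken together, the vocab.py changes make both factories path-based. A short usage sketch consistent with the docstrings above (file names are hypothetical; 'vocab.txt' holds one token per line, 'raw.txt' arbitrary text):

    import torch
    from torchtext.experimental.transforms import basic_english_normalize
    from torchtext.experimental.vocab import (
        build_vocab_from_text_file,
        load_vocab_from_file,
    )

    # One token per line -> Vocab; tokens below min_freq are dropped.
    v = load_vocab_from_file('vocab.txt', min_freq=1, unk_token='<unk>')

    # Raw text -> Vocab; a JITed tokenizer is applied in parallel.
    jit_tokenizer = torch.jit.script(basic_english_normalize())
    v_raw = build_vocab_from_text_file('raw.txt', jit_tokenizer, num_cpus=4)

    # Batch look-up, the operation this PR benchmarks.
    indices = v.lookup_indices(['token1', 'token2'])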