Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improving Scan_Vocab speed, build_vocab_from_freq function. Iteration 2 #1695

Merged
merged 21 commits into from
Nov 8, 2017
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions gensim/models/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,15 +647,19 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No

Examples
--------
>>> build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`model` is undefined here; please create the model first. A docstring example should be executable, i.e. I should be able to copy-paste this code into a console and have it run successfully. We plan to add doctests to our CI soon.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

>>> model.build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PEP8: model.build_vocab_from_freq({"Word1": 15, "Word2": 20}, update=True)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, what's the problem with this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Spaces are needed after `:` and after `,` (my comment shows the fixed variant).

"""
logger.info("Processing provided word frequencies")
vocab = defaultdict(int, word_freq)
raw_vocab = word_freq # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) to be directly the raw vocab
logger.info(
"collected %i different raw word, with total frequency of %i",
len(raw_vocab), sum(itervalues(raw_vocab))
)

self.corpus_count = corpus_count if corpus_count else 0
self.raw_vocab = vocab
self.corpus_count = corpus_count if corpus_count else 0 # Since no sentences are provided, this is to control the corpus_count
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PEP8 - two spaces before #

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are 2 spaces, aren't they?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, really, sorry

self.raw_vocab = raw_vocab

self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling
self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,update=update) # trim by min_count & precalculate downsampling
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please revert to the previous variant.

self.finalize_vocab(update=update) # build tables & arrays

def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
Expand All @@ -675,14 +679,14 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
type(sentence)
)
checked_string_types += 1
if sentence_no % progress_per == 0:
if sentence_no % progress_per == 0 and sentence_no != 0:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this needed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because 0 % anything equals 0, so the logger will emit a statement saying it is at sentence 0 and has processed 0 words.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But we want that :)

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But we want that :)

logger.info(
"PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
sentence_no, total_words, len(vocab)
)
for word in sentence:
vocab[word] += 1
total_words += 1
total_words += len(sentence)

if self.max_vocab_size and len(vocab) > self.max_vocab_size:
utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
Expand Down
34 changes: 30 additions & 4 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ def testBuildVocabFromFreq(self):
model_neg = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=0, negative=5)
model_hs.build_vocab_from_freq(freq_dict)
model_neg.build_vocab_from_freq(freq_dict)
self.assertTrue(len(model_hs.wv.vocab), 12)
self.assertTrue(len(model_neg.wv.vocab), 12)
self.assertEqual(len(model_hs.wv.vocab), 12)
self.assertEqual(len(model_neg.wv.vocab), 12)
self.assertEqual(model_hs.wv.vocab['minors'].count, 2)
self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
self.assertEqual(model_hs.wv.vocab['system'].count, 4)
Expand Down Expand Up @@ -126,11 +126,37 @@ def testBuildVocabFromFreq(self):
new_freq_dict = {'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1}
model_hs.build_vocab_from_freq(new_freq_dict, update=True)
model_neg.build_vocab_from_freq(new_freq_dict, update=True)
self.assertTrue(model_hs.wv.vocab['graph'].count, 4)
self.assertTrue(model_hs.wv.vocab['artificial'].count, 4)
self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
self.assertEqual(len(model_hs.wv.vocab), 14)
self.assertEqual(len(model_neg.wv.vocab), 14)

def testPruneVocab(self):
    """Check the vocabulary produced when scanning with a small max_vocab_size."""
    # Both scenarios use identical model settings; only the corpus differs.
    # max_vocab_size=2 is small enough that scanning exceeds it and prunes.
    model_kwargs = dict(size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)

    # Scenario 1: "eps" occurs once; only "graph" and "system" are expected to remain.
    short_corpus = [
        ["graph", "system"],
        ["graph", "system"],
        ["system", "eps"],
        ["graph", "system"],
    ]
    model = word2vec.Word2Vec(short_corpus, **model_kwargs)
    self.assertEqual(len(model.wv.vocab), 2)
    for word, expected_count in (("graph", 3), ("system", 4)):
        self.assertEqual(model.wv.vocab[word].count, expected_count)

    # Scenario 2: same corpus plus a final sentence with repeated new words.
    # The assertions pin three surviving entries: "graph", "minors" and "system".
    long_corpus = [
        ["graph", "system"],
        ["graph", "system"],
        ["system", "eps"],
        ["graph", "system"],
        ["minors", "survey", "minors", "survey", "minors"],
    ]
    model = word2vec.Word2Vec(long_corpus, **model_kwargs)
    self.assertEqual(len(model.wv.vocab), 3)
    for word, expected_count in (("graph", 3), ("minors", 3), ("system", 4)):
        self.assertEqual(model.wv.vocab[word].count, expected_count)

def testOnlineLearning(self):
"""Test that the algorithm is able to add new words to the
vocabulary and to a trained model when using a sorted vocabulary"""
Expand Down