Commit

rm Word2VecVocab class, persistent callbacks (bug piskvorky#2136)
gojomo committed Jan 8, 2020
1 parent 0d8f06f commit dee6207
Showing 5 changed files with 312 additions and 325 deletions.
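The user-visible effect of the callbacks change in the diff below is that callbacks are no longer retained on the model: they apply only to the train() call (or corpus-supplying __init__) they are passed to. A minimal usage sketch, assuming a gensim version that includes this change and gensim's public Word2Vec / CallbackAny2Vec API; the EpochLogger class is a hypothetical example callback, not part of this commit.

from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

class EpochLogger(CallbackAny2Vec):
    """Hypothetical callback: report each completed epoch."""
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        self.epoch += 1
        print("epoch %d done" % self.epoch)

sentences = [["hello", "word2vec"], ["callbacks", "are", "per", "call"]]
logger = EpochLogger()

# Callbacks passed at construction are forwarded to the initial train() call ...
model = Word2Vec(sentences, min_count=1, callbacks=[logger])

# ... but they are not remembered by the model, so any later training must supply them again.
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs, callbacks=[logger])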
60 changes: 31 additions & 29 deletions gensim/models/base_any2vec.py
@@ -94,7 +94,6 @@ def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_wor
self.total_train_time = 0
self.batch_words = batch_words
self.model_trimmed_post_training = False
self.callbacks = callbacks

def _get_job_params(self, cur_epoch):
"""Get job parameters required for each batch."""
@@ -193,19 +192,20 @@ def _worker_loop(self, job_queue, progress_queue):
"""
thread_private_mem = self._get_thread_working_mem()
jobs_processed = 0
callbacks = progress_queue.callbacks
while True:
job = job_queue.get()
if job is None:
progress_queue.put(None)
break # no more jobs => quit this worker
data_iterable, job_parameters = job

for callback in self.callbacks:
for callback in callbacks:
callback.on_batch_begin(self)

tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem)

for callback in self.callbacks:
for callback in callbacks:
callback.on_batch_end(self)

progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress
@@ -366,7 +366,8 @@ def _log_epoch_progress(self, progress_queue=None, job_queue=None, cur_epoch=0,
self.total_train_time += elapsed
return trained_word_count, raw_word_count, job_tally

def _train_epoch_corpusfile(self, corpus_file, cur_epoch=0, total_examples=None, total_words=None, **kwargs):
def _train_epoch_corpusfile(
self, corpus_file, cur_epoch=0, total_examples=None, total_words=None, callbacks=(), **kwargs):
"""Train the model for a single epoch.
Parameters
@@ -430,7 +431,7 @@ def _train_epoch_corpusfile(self, corpus_file, cur_epoch=0, total_examples=None,
return trained_word_count, raw_word_count, job_tally

def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, total_words=None,
queue_factor=2, report_delay=1.0):
queue_factor=2, report_delay=1.0, callbacks=()):
"""Train the model for a single epoch.
Parameters
@@ -462,6 +463,7 @@ def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, total_wo
"""
job_queue = Queue(maxsize=queue_factor * self.workers)
progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)
progress_queue.callbacks = callbacks # messy way to pass along for just this session

workers = [
threading.Thread(
@@ -522,15 +524,13 @@ def train(self, data_iterable=None, corpus_file=None, epochs=None, total_example
"""
self._set_train_params(**kwargs)
if callbacks:
self.callbacks = callbacks
self.epochs = epochs
self._check_training_sanity(
epochs=epochs,
total_examples=total_examples,
total_words=total_words, **kwargs)

for callback in self.callbacks:
for callback in callbacks:
callback.on_train_begin(self)

trained_word_count = 0
@@ -539,22 +539,24 @@
job_tally = 0

for cur_epoch in range(self.epochs):
for callback in self.callbacks:
for callback in callbacks:
callback.on_epoch_begin(self)

if data_iterable is not None:
trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch(
data_iterable, cur_epoch=cur_epoch, total_examples=total_examples,
total_words=total_words, queue_factor=queue_factor, report_delay=report_delay)
total_words=total_words, queue_factor=queue_factor, report_delay=report_delay,
callbacks=callbacks)
else:
trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch_corpusfile(
corpus_file, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words, **kwargs)
corpus_file, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words,
callbacks=callbacks, **kwargs)

trained_word_count += trained_word_count_epoch
raw_word_count += raw_word_count_epoch
job_tally += job_tally_epoch

for callback in self.callbacks:
for callback in callbacks:
callback.on_epoch_end(self)

# Log overall time
@@ -564,7 +566,7 @@ def train(self, data_iterable=None, corpus_file=None, epochs=None, total_example
self.train_count += 1 # number of times train() has been called
self._clear_post_train()

for callback in self.callbacks:
for callback in callbacks:
callback.on_train_end(self)
return trained_word_count, raw_word_count

@@ -730,13 +732,19 @@ def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100,
self.train(
sentences=sentences, corpus_file=corpus_file, total_examples=self.corpus_count,
total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha,
end_alpha=self.min_alpha, compute_loss=compute_loss)
end_alpha=self.min_alpha, compute_loss=compute_loss, callbacks=callbacks)
else:
if trim_rule is not None:
logger.warning(
"The rule, if given, is only used to prune vocabulary during build_vocab() "
"and is not stored as part of the model. Model initialized without sentences. "
"trim_rule provided, if any, will be ignored.")
if callbacks:
logger.warning(
"Callbacks are no longer retained by the model, so must be provided whenever "
"training is triggered, as in initialization with a corpus or calling `train()`. "
"The callbacks provided in this initialization without triggering train will "
"be ignored.")

def _clear_post_train(self):
raise NotImplementedError()
@@ -797,18 +805,16 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p
* `min_count` (int) - the minimum count threshold.
**kwargs : object
Key word arguments propagated to `self.vocabulary.prepare_vocab`
Key word arguments propagated to `self.prepare_vocab`
"""
total_words, corpus_count = self.vocabulary.scan_vocab(
total_words, corpus_count = self.scan_vocab(
sentences=sentences, corpus_file=corpus_file, progress_per=progress_per, trim_rule=trim_rule)
self.corpus_count = corpus_count
self.corpus_total_words = total_words
report_values = self.vocabulary.prepare_vocab(
self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab,
trim_rule=trim_rule, **kwargs)
report_values = self.prepare_vocab(update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs)
report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words'])
self.trainables.prepare_weights(self.hs, self.negative, self.wv, update=update, vocabulary=self.vocabulary)
self.trainables.prepare_weights(self.hs, self.negative, self.wv, update=update, vocabulary=self)

def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False):
"""Build vocabulary from a dictionary of word frequencies.
@@ -850,15 +856,13 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No

# Since no sentences are provided, this is to control the corpus_count.
self.corpus_count = corpus_count or 0
self.vocabulary.raw_vocab = raw_vocab
self.raw_vocab = raw_vocab

# trim by min_count & precalculate downsampling
report_values = self.vocabulary.prepare_vocab(
self.hs, self.negative, self.wv, keep_raw_vocab=keep_raw_vocab,
trim_rule=trim_rule, update=update)
report_values = self.prepare_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)
report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words'])
self.trainables.prepare_weights(
self.hs, self.negative, self.wv, update=update, vocabulary=self.vocabulary) # build tables & arrays
self.hs, self.negative, self.wv, update=update, vocabulary=self) # build tables & arrays

def estimate_memory(self, vocab_size=None, report=None):
"""Estimate required memory for a model using current settings and provided vocabulary size.
@@ -1075,7 +1079,7 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N
"training model with %i workers on %i vocabulary and %i features, "
"using sg=%s hs=%s sample=%s negative=%s window=%s",
self.workers, len(self.wv.vocab), self.trainables.layer1_size, self.sg,
self.hs, self.vocabulary.sample, self.negative, self.window
self.hs, self.sample, self.negative, self.window
)

@classmethod
@@ -1112,10 +1116,8 @@ def load(cls, *args, **kwargs):
model = super(BaseWordEmbeddingsModel, cls).load(*args, **kwargs)
if not hasattr(model, 'ns_exponent'):
model.ns_exponent = 0.75
if not hasattr(model.vocabulary, 'ns_exponent'):
model.vocabulary.ns_exponent = 0.75
if model.negative and hasattr(model.wv, 'index2word'):
model.vocabulary.make_cum_table(model.wv) # rebuild cum_table from vocabulary
model.make_cum_table() # rebuild cum_table from vocabulary ## TODO: ???
if not hasattr(model, 'corpus_count'):
model.corpus_count = None
if not hasattr(model, 'corpus_total_words'):
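The _worker_loop and _train_epoch hunks above replace the old self.callbacks lookup with callbacks attached to the progress queue for the duration of one training session (the "messy way to pass along" noted in the diff). Below is a simplified, self-contained sketch of that plumbing with the actual training work elided; the function and names here are illustrative, not gensim's exact internals.

import threading
from queue import Queue

def train_epoch(batches, callbacks=(), workers=2):
    job_queue = Queue(maxsize=2 * workers)
    progress_queue = Queue(maxsize=3 * workers)
    progress_queue.callbacks = callbacks  # scoped to this session, never stored on the model

    def worker_loop():
        cbs = progress_queue.callbacks    # workers read the session's callbacks from the queue
        while True:
            job = job_queue.get()
            if job is None:
                progress_queue.put(None)  # signal that this worker is done
                break
            for cb in cbs:
                cb.on_batch_begin(None)
            # ... the real code trains on `job` here ...
            for cb in cbs:
                cb.on_batch_end(None)
            progress_queue.put(len(job))  # report progress for this batch

    threads = [threading.Thread(target=worker_loop) for _ in range(workers)]
    for t in threads:
        t.start()
    for batch in batches:
        job_queue.put(batch)
    for _ in threads:
        job_queue.put(None)               # one end-of-work sentinel per worker
    for t in threads:
        t.join()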
2 changes: 1 addition & 1 deletion gensim/models/deprecated/word2vec.py
@@ -203,7 +203,7 @@ def load_old_word2vec(*args, **kwargs):
# set vocabulary attributes
new_model.wv.vocab = old_model.wv.vocab
new_model.wv.index2word = old_model.wv.index2word
new_model.vocabulary.cum_table = old_model.__dict__.get('cum_table', None)
new_model.cum_table = old_model.__dict__.get('cum_table', None)

new_model.train_count = old_model.__dict__.get('train_count', None)
new_model.corpus_count = old_model.__dict__.get('corpus_count', None)
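With the Word2VecVocab class removed, the vocabulary state and helpers that previously hung off model.vocabulary live directly on the model, which is what the attribute renames in both files above reflect. A rough before/after mapping, limited to the names visible in this commit and assuming a gensim version that includes the change; anything beyond those names is an assumption.

from gensim.models import Word2Vec

# Before this commit:                     After this commit:
#   model.vocabulary.sample                 model.sample
#   model.vocabulary.raw_vocab              model.raw_vocab
#   model.vocabulary.cum_table              model.cum_table
#   model.vocabulary.scan_vocab(...)        model.scan_vocab(...)
#   model.vocabulary.prepare_vocab(
#       model.hs, model.negative,
#       model.wv, ...)                      model.prepare_vocab(...)
#   model.vocabulary.make_cum_table(
#       model.wv)                           model.make_cum_table()

model = Word2Vec(min_count=1, sample=1e-3)
model.build_vocab([["tiny", "corpus"], ["for", "the", "sketch"]])
print(model.sample)            # read straight off the model, no .vocabulary indirection
print(len(model.cum_table))    # negative-sampling table, built during build_vocab()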