From 6afb3ff9bcaa4f5faef05a4223c324852b2e4c22 Mon Sep 17 00:00:00 2001 From: Paul Andrey Date: Mon, 7 Jun 2021 17:52:16 +0200 Subject: [PATCH 01/13] Implemented `reduced_windows` argument for Word2Vec. Co-Authored-By: Mathis Demay --- gensim/models/word2vec.py | 43 +++++++++++++++++++-------- gensim/models/word2vec_corpusfile.pyx | 24 +++++++++++---- gensim/models/word2vec_inner.pyx | 26 ++++++++++++---- 3 files changed, 68 insertions(+), 25 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 2593a373b0..e3c22dd95c 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -223,13 +223,13 @@ def train_epoch_sg( model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, - _work, _neu1, compute_loss, + _work, _neu1, compute_loss, reduced_windows, ): raise RuntimeError("Training with corpus_file argument is not supported") def train_epoch_cbow( model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, - _work, _neu1, compute_loss, + _work, _neu1, compute_loss, reduced_windows, ): raise RuntimeError("Training with corpus_file argument is not supported") @@ -240,7 +240,7 @@ def __init__( max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), - comment=None, max_final_vocab=None, + comment=None, max_final_vocab=None, reduced_windows=True ): """Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/. @@ -345,6 +345,9 @@ def __init__( :meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`. callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional Sequence of callbacks to be executed at specific stages during training. + reduced_windows : bool, optional + If True, the window size is uniformly sampled from {1, `window`} + during training. Otherwise, it is fixed to `window`. 
Examples -------- @@ -377,6 +380,7 @@ def __init__( self.min_alpha = float(min_alpha) self.window = int(window) + self.reduced_windows = bool(reduced_windows) self.random = np.random.RandomState(seed) self.hs = int(hs) @@ -419,7 +423,8 @@ def __init__( self.train( corpus_iterable=corpus_iterable, corpus_file=corpus_file, total_examples=self.corpus_count, total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha, - end_alpha=self.min_alpha, compute_loss=self.compute_loss, callbacks=callbacks) + end_alpha=self.min_alpha, compute_loss=self.compute_loss, callbacks=callbacks, + reduced_windows=self.reduced_windows) else: if trim_rule is not None: logger.warning( @@ -910,12 +915,14 @@ def _do_train_epoch( if self.sg: examples, tally, raw_tally = train_epoch_sg( self, corpus_file, offset, cython_vocab, cur_epoch, - total_examples, total_words, work, neu1, self.compute_loss, + total_examples, total_words, work, neu1, + self.compute_loss, self.reduced_windows, ) else: examples, tally, raw_tally = train_epoch_cbow( self, corpus_file, offset, cython_vocab, cur_epoch, - total_examples, total_words, work, neu1, self.compute_loss, + total_examples, total_words, work, neu1, + self.compute_loss, self.reduced_windows, ) return examples, tally, raw_tally @@ -941,9 +948,15 @@ def _do_train_job(self, sentences, alpha, inits): work, neu1 = inits tally = 0 if self.sg: - tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss) + tally += train_batch_sg( + self, sentences, alpha, work, + self.compute_loss, self.reduced_windows, + ) else: - tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss) + tally += train_batch_cbow( + self, sentences, alpha, work, neu1, + self.compute_loss, self.reduced_windows, + ) return tally, self._raw_word_count(sentences) def _clear_post_train(self): @@ -951,10 +964,10 @@ def _clear_post_train(self): self.wv.norms = None def train( - self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, - epochs=None, start_alpha=None, end_alpha=None, word_count=0, - queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), - **kwargs, + self, corpus_iterable=None, corpus_file=None, total_examples=None, + total_words=None, epochs=None, start_alpha=None, end_alpha=None, + word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False, + reduced_windows=True, callbacks=(), **kwargs, ): """Update the model's neural weights from a sequence of sentences. @@ -1011,6 +1024,9 @@ def train( compute_loss: bool, optional If True, computes and stores loss value which can be retrieved using :meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`. + reduced_windows : bool, optional + If True, the window size is uniformly sampled from {1, `window`} + during training. Otherwise, it is fixed to `window`. callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional Sequence of callbacks to be executed at specific stages during training. 
@@ -1030,6 +1046,7 @@ def train( self.alpha = start_alpha or self.alpha self.min_alpha = end_alpha or self.min_alpha self.epochs = epochs + self.reduced_windows = reduced_windows self._check_training_sanity(epochs=epochs, total_examples=total_examples, total_words=total_words) self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=epochs) @@ -1039,7 +1056,7 @@ def train( msg=( f"training model with {self.workers} workers on {len(self.wv)} vocabulary and " f"{self.layer1_size} features, using sg={self.sg} hs={self.hs} sample={self.sample} " - f"negative={self.negative} window={self.window}" + f"negative={self.negative} window={self.window} reduced_windows={self.reduced_windows}" ), ) diff --git a/gensim/models/word2vec_corpusfile.pyx b/gensim/models/word2vec_corpusfile.pyx index 19b9b8c165..a59ab721ea 100644 --- a/gensim/models/word2vec_corpusfile.pyx +++ b/gensim/models/word2vec_corpusfile.pyx @@ -186,7 +186,8 @@ cdef void prepare_c_structures_for_batch( vector[vector[string]] &sentences, int sample, int hs, int window, long long *total_words, int *effective_words, int *effective_sentences, unsigned long long *next_random, cvocab_t *vocab, int *sentence_idx, np.uint32_t *indexes, int *codelens, - np.uint8_t **codes, np.uint32_t **points, np.uint32_t *reduced_windows) nogil: + np.uint8_t **codes, np.uint32_t **points, np.uint32_t *reduced_windows, + bint do_reduced_windows) nogil: cdef VocabItem word cdef string token cdef vector[string] sent @@ -225,7 +226,10 @@ cdef void prepare_c_structures_for_batch( # precompute "reduced window" offsets in a single randint() call for i in range(effective_words[0]): - reduced_windows[i] = random_int32(next_random) % window + if do_reduced_windows: + reduced_windows[i] = random_int32(next_random) % window + else: + reduced_windows[i] = window cdef REAL_t get_alpha(REAL_t alpha, REAL_t end_alpha, int cur_epoch, int num_epochs) nogil: @@ -250,7 +254,7 @@ cdef REAL_t get_next_alpha( def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, _work, - _neu1, compute_loss): + _neu1, compute_loss, reduced_windows): """Train Skipgram model for one epoch by training on an input stream. This function is used only in multistream mode. Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`. @@ -269,6 +273,9 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec Private working memory for each worker. compute_loss : bool Whether or not the training loss should be computed in this batch. + reduced_windows : bool + Whether or not the window size should be reduced based on random + uniform sampling. 
Returns ------- @@ -295,6 +302,7 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec cdef long long total_sentences = 0 cdef long long total_effective_words = 0, total_words = 0 cdef int sent_idx, idx_start, idx_end + cdef bint do_reduced_windows = reduced_windows init_w2v_config(&c, model, _alpha, compute_loss, _work) @@ -311,7 +319,7 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec prepare_c_structures_for_batch( sentences, c.sample, c.hs, c.window, &total_words, &effective_words, &effective_sentences, &c.next_random, vocab.get_vocab_ptr(), c.sentence_idx, c.indexes, - c.codelens, c.codes, c.points, c.reduced_windows) + c.codelens, c.codes, c.points, c.reduced_windows, do_reduced_windows) for sent_idx in range(effective_sentences): idx_start = c.sentence_idx[sent_idx] @@ -350,7 +358,7 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, _work, - _neu1, compute_loss): + _neu1, compute_loss, reduced_windows): """Train CBOW model for one epoch by training on an input stream. This function is used only in multistream mode. Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`. @@ -369,6 +377,9 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp Private working memory for each worker. compute_loss : bool Whether or not the training loss should be computed in this batch. + reduced_windows : bool + Whether or not the window size should be reduced based on random + uniform sampling. Returns ------- @@ -395,6 +406,7 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp cdef long long total_sentences = 0 cdef long long total_effective_words = 0, total_words = 0 cdef int sent_idx, idx_start, idx_end + cdef bint do_reduced_windows = reduced_windows init_w2v_config(&c, model, _alpha, compute_loss, _work, _neu1) @@ -411,7 +423,7 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp prepare_c_structures_for_batch( sentences, c.sample, c.hs, c.window, &total_words, &effective_words, &effective_sentences, &c.next_random, vocab.get_vocab_ptr(), c.sentence_idx, - c.indexes, c.codelens, c.codes, c.points, c.reduced_windows) + c.indexes, c.codelens, c.codes, c.points, c.reduced_windows, do_reduced_windows) for sent_idx in range(effective_sentences): idx_start = c.sentence_idx[sent_idx] diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index 9dc47fb040..64525094aa 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -71,10 +71,10 @@ cdef void our_saxpy_noblas(const int *N, const float *alpha, const float *X, con for i from 0 <= i < N[0] by 1: Y[i * (incY[0])] = (alpha[0]) * X[i * (incX[0])] + Y[i * (incY[0])] -cdef long long _mul(const np.uint32_t a, const int b) nogil: +cdef long long _mul(const np.uint32_t a, const int b) nogil: """Safe multiplication of ints with explict typecasting""" return a * b - + cdef void w2v_fast_sentence_sg_hs( const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen, REAL_t *syn0, REAL_t *syn1, const int size, @@ -502,7 +502,7 @@ cdef init_w2v_config(Word2VecConfig *c, model, alpha, compute_loss, _work, _neu1 c[0].neu1 = np.PyArray_DATA(_neu1) -def train_batch_sg(model, sentences, alpha, _work, compute_loss): +def train_batch_sg(model, sentences, alpha, _work, compute_loss, 
reduced_windows): """Update skip-gram model by training on a batch of sentences. Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`. @@ -519,6 +519,9 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss): Private working memory for each worker. compute_loss : bool Whether or not the training loss should be computed in this batch. + reduced_windows : bool + Whether or not the window size should be reduced based on random + uniform sampling. Returns ------- @@ -570,7 +573,11 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss): break # TODO: log warning, tally overflow? # precompute "reduced window" offsets in a single randint() call - for i, item in enumerate(model.random.randint(0, c.window, effective_words)): + if reduced_windows: + window_size = model.random.randint(0, c.window, effective_words) + else: + window_size = [0] * effective_words + for i, item in enumerate(window_size): c.reduced_windows[i] = item # release GIL & train on all sentences @@ -597,7 +604,7 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss): return effective_words -def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss): +def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss, reduced_windows): """Update CBOW model by training on a batch of sentences. Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`. @@ -616,6 +623,9 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss): Private working memory for each worker. compute_loss : bool Whether or not the training loss should be computed in this batch. + reduced_windows : bool + Whether or not the window size should be reduced based on random + uniform sampling. Returns ------- @@ -666,7 +676,11 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss): break # TODO: log warning, tally overflow? # precompute "reduced window" offsets in a single randint() call - for i, item in enumerate(model.random.randint(0, c.window, effective_words)): + if reduced_windows: + window_size = model.random.randint(0, c.window, effective_words) + else: + window_size = [0] * effective_words + for i, item in enumerate(window_size): c.reduced_windows[i] = item # release GIL & train on all sentences From 9bdb5a1ecb3e42a5cd789473a34a7413b923e139 Mon Sep 17 00:00:00 2001 From: Paul Andrey Date: Thu, 10 Jun 2021 16:59:29 +0200 Subject: [PATCH 02/13] Improve the way `reduced_windows` is passed around and used. 
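The behaviour both code paths implement can be summarised in plain Python. The sketch below is illustrative only; `reduced_window_offsets` and its argument names are not identifiers from this patch, they merely model how the precomputed `reduced_windows` offsets relate to the effective window size.

    import numpy as np

    def reduced_window_offsets(reduce, rng, window, n_words):
        """Model of the per-word offsets precomputed into `reduced_windows`.

        With reduce=True each offset is drawn uniformly from [0, window),
        so the effective window becomes window - offset, i.e. 1..window.
        With reduce=False every offset is 0 and the full window is used.
        """
        if reduce:
            return rng.randint(0, window, n_words)
        return np.zeros(n_words, dtype=np.uint32)

    rng = np.random.RandomState(1)
    print(reduced_window_offsets(True, rng, window=5, n_words=8))   # offsets in [0, 5)
    print(reduced_window_offsets(False, rng, window=5, n_words=8))  # all zeros
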
--- gensim/models/word2vec.py | 31 ++++++++++++++++----------- gensim/models/word2vec_corpusfile.pyx | 18 ++++++---------- gensim/models/word2vec_inner.pyx | 28 +++++++++--------------- 3 files changed, 34 insertions(+), 43 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index e3c22dd95c..53a02d80fe 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -223,13 +223,13 @@ def train_epoch_sg( model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, - _work, _neu1, compute_loss, reduced_windows, + _work, _neu1, compute_loss, ): raise RuntimeError("Training with corpus_file argument is not supported") def train_epoch_cbow( model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, - _work, _neu1, compute_loss, reduced_windows, + _work, _neu1, compute_loss, ): raise RuntimeError("Training with corpus_file argument is not supported") @@ -240,7 +240,7 @@ def __init__( max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), - comment=None, max_final_vocab=None, reduced_windows=True + comment=None, max_final_vocab=None, reduced_windows=True, ): """Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/. @@ -346,8 +346,10 @@ def __init__( callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional Sequence of callbacks to be executed at specific stages during training. reduced_windows : bool, optional - If True, the window size is uniformly sampled from {1, `window`} - during training. Otherwise, it is fixed to `window`. + If True, the effective window size is uniformly sampled from [1, `window`] + for each target word during training, to match the original word2vec algorithm's + approximate weighting of context words by distance. Otherwise, the effective + window size is always fixed to `window` words to either side. Examples -------- @@ -916,13 +918,13 @@ def _do_train_epoch( examples, tally, raw_tally = train_epoch_sg( self, corpus_file, offset, cython_vocab, cur_epoch, total_examples, total_words, work, neu1, - self.compute_loss, self.reduced_windows, + self.compute_loss, ) else: examples, tally, raw_tally = train_epoch_cbow( self, corpus_file, offset, cython_vocab, cur_epoch, total_examples, total_words, work, neu1, - self.compute_loss, self.reduced_windows, + self.compute_loss, ) return examples, tally, raw_tally @@ -950,12 +952,12 @@ def _do_train_job(self, sentences, alpha, inits): if self.sg: tally += train_batch_sg( self, sentences, alpha, work, - self.compute_loss, self.reduced_windows, + self.compute_loss, ) else: tally += train_batch_cbow( self, sentences, alpha, work, neu1, - self.compute_loss, self.reduced_windows, + self.compute_loss, ) return tally, self._raw_word_count(sentences) @@ -967,7 +969,7 @@ def train( self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False, - reduced_windows=True, callbacks=(), **kwargs, + reduced_windows=None, callbacks=(), **kwargs, ): """Update the model's neural weights from a sequence of sentences. 
@@ -1025,8 +1027,10 @@ def train( If True, computes and stores loss value which can be retrieved using :meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`. reduced_windows : bool, optional - If True, the window size is uniformly sampled from {1, `window`} - during training. Otherwise, it is fixed to `window`. + If True, the effective window size is uniformly sampled from [1, `window`] + for each target word during training, to match the original word2vec algorithm's + approximate weighting of context words by distance. Otherwise, the effective + window size is always fixed to `window` words to either side. callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional Sequence of callbacks to be executed at specific stages during training. @@ -1046,7 +1050,8 @@ def train( self.alpha = start_alpha or self.alpha self.min_alpha = end_alpha or self.min_alpha self.epochs = epochs - self.reduced_windows = reduced_windows + if reduced_windows is not None: + self.reduced_windows = bool(reduced_windows) self._check_training_sanity(epochs=epochs, total_examples=total_examples, total_words=total_words) self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=epochs) diff --git a/gensim/models/word2vec_corpusfile.pyx b/gensim/models/word2vec_corpusfile.pyx index a59ab721ea..2af398e58b 100644 --- a/gensim/models/word2vec_corpusfile.pyx +++ b/gensim/models/word2vec_corpusfile.pyx @@ -187,7 +187,7 @@ cdef void prepare_c_structures_for_batch( int *effective_words, int *effective_sentences, unsigned long long *next_random, cvocab_t *vocab, int *sentence_idx, np.uint32_t *indexes, int *codelens, np.uint8_t **codes, np.uint32_t **points, np.uint32_t *reduced_windows, - bint do_reduced_windows) nogil: + int do_reduced_windows) nogil: cdef VocabItem word cdef string token cdef vector[string] sent @@ -229,7 +229,7 @@ cdef void prepare_c_structures_for_batch( if do_reduced_windows: reduced_windows[i] = random_int32(next_random) % window else: - reduced_windows[i] = window + reduced_windows[i] = 0 cdef REAL_t get_alpha(REAL_t alpha, REAL_t end_alpha, int cur_epoch, int num_epochs) nogil: @@ -254,7 +254,7 @@ cdef REAL_t get_next_alpha( def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, _work, - _neu1, compute_loss, reduced_windows): + _neu1, compute_loss,): """Train Skipgram model for one epoch by training on an input stream. This function is used only in multistream mode. Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`. @@ -273,9 +273,6 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec Private working memory for each worker. compute_loss : bool Whether or not the training loss should be computed in this batch. - reduced_windows : bool - Whether or not the window size should be reduced based on random - uniform sampling. 
Returns ------- @@ -302,7 +299,7 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec cdef long long total_sentences = 0 cdef long long total_effective_words = 0, total_words = 0 cdef int sent_idx, idx_start, idx_end - cdef bint do_reduced_windows = reduced_windows + cdef int do_reduced_windows = int(model.reduced_windows) init_w2v_config(&c, model, _alpha, compute_loss, _work) @@ -358,7 +355,7 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, _work, - _neu1, compute_loss, reduced_windows): + _neu1, compute_loss,): """Train CBOW model for one epoch by training on an input stream. This function is used only in multistream mode. Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`. @@ -377,9 +374,6 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp Private working memory for each worker. compute_loss : bool Whether or not the training loss should be computed in this batch. - reduced_windows : bool - Whether or not the window size should be reduced based on random - uniform sampling. Returns ------- @@ -406,7 +400,7 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp cdef long long total_sentences = 0 cdef long long total_effective_words = 0, total_words = 0 cdef int sent_idx, idx_start, idx_end - cdef bint do_reduced_windows = reduced_windows + cdef int do_reduced_windows = int(model.reduced_windows) init_w2v_config(&c, model, _alpha, compute_loss, _work, _neu1) diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index 64525094aa..7389f70054 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -502,7 +502,7 @@ cdef init_w2v_config(Word2VecConfig *c, model, alpha, compute_loss, _work, _neu1 c[0].neu1 = np.PyArray_DATA(_neu1) -def train_batch_sg(model, sentences, alpha, _work, compute_loss, reduced_windows): +def train_batch_sg(model, sentences, alpha, _work, compute_loss): """Update skip-gram model by training on a batch of sentences. Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`. @@ -519,9 +519,6 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss, reduced_windows Private working memory for each worker. compute_loss : bool Whether or not the training loss should be computed in this batch. - reduced_windows : bool - Whether or not the window size should be reduced based on random - uniform sampling. Returns ------- @@ -573,12 +570,11 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss, reduced_windows break # TODO: log warning, tally overflow? 
# precompute "reduced window" offsets in a single randint() call - if reduced_windows: - window_size = model.random.randint(0, c.window, effective_words) + if model.reduced_windows: + for i, item in enumerate(model.random.randint(0, c.window, effective_words)): + c.reduced_windows[i] = item else: - window_size = [0] * effective_words - for i, item in enumerate(window_size): - c.reduced_windows[i] = item + c.reduced_windows[:] = int(0) # release GIL & train on all sentences with nogil: @@ -604,7 +600,7 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss, reduced_windows return effective_words -def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss, reduced_windows): +def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss): """Update CBOW model by training on a batch of sentences. Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`. @@ -623,9 +619,6 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss, reduce Private working memory for each worker. compute_loss : bool Whether or not the training loss should be computed in this batch. - reduced_windows : bool - Whether or not the window size should be reduced based on random - uniform sampling. Returns ------- @@ -676,12 +669,11 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss, reduce break # TODO: log warning, tally overflow? # precompute "reduced window" offsets in a single randint() call - if reduced_windows: - window_size = model.random.randint(0, c.window, effective_words) + if model.reduced_windows: + for i, item in enumerate(model.random.randint(0, c.window, effective_words)): + c.reduced_windows[i] = item else: - window_size = [0] * effective_words - for i, item in enumerate(window_size): - c.reduced_windows[i] = item + c.reduced_windows[:] = int(0) # release GIL & train on all sentences with nogil: From 77698cd502c268a41fed3db5ba3833f95d8576f7 Mon Sep 17 00:00:00 2001 From: Paul Andrey Date: Thu, 10 Jun 2021 17:01:45 +0200 Subject: [PATCH 03/13] Renamed `reduced_windows` to `shrink_windows`. --- gensim/models/word2vec.py | 18 +++++++++--------- gensim/models/word2vec_corpusfile.pyx | 12 ++++++------ gensim/models/word2vec_inner.pyx | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 53a02d80fe..aa245c369a 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -240,7 +240,7 @@ def __init__( max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), - comment=None, max_final_vocab=None, reduced_windows=True, + comment=None, max_final_vocab=None, shrink_windows=True, ): """Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/. @@ -345,7 +345,7 @@ def __init__( :meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`. callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional Sequence of callbacks to be executed at specific stages during training. - reduced_windows : bool, optional + shrink_windows : bool, optional If True, the effective window size is uniformly sampled from [1, `window`] for each target word during training, to match the original word2vec algorithm's approximate weighting of context words by distance. 
Otherwise, the effective @@ -382,7 +382,7 @@ def __init__( self.min_alpha = float(min_alpha) self.window = int(window) - self.reduced_windows = bool(reduced_windows) + self.shrink_windows = bool(shrink_windows) self.random = np.random.RandomState(seed) self.hs = int(hs) @@ -426,7 +426,7 @@ def __init__( corpus_iterable=corpus_iterable, corpus_file=corpus_file, total_examples=self.corpus_count, total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha, end_alpha=self.min_alpha, compute_loss=self.compute_loss, callbacks=callbacks, - reduced_windows=self.reduced_windows) + shrink_windows=self.shrink_windows) else: if trim_rule is not None: logger.warning( @@ -969,7 +969,7 @@ def train( self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False, - reduced_windows=None, callbacks=(), **kwargs, + shrink_windows=None, callbacks=(), **kwargs, ): """Update the model's neural weights from a sequence of sentences. @@ -1026,7 +1026,7 @@ def train( compute_loss: bool, optional If True, computes and stores loss value which can be retrieved using :meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`. - reduced_windows : bool, optional + shrink_windows : bool, optional If True, the effective window size is uniformly sampled from [1, `window`] for each target word during training, to match the original word2vec algorithm's approximate weighting of context words by distance. Otherwise, the effective @@ -1050,8 +1050,8 @@ def train( self.alpha = start_alpha or self.alpha self.min_alpha = end_alpha or self.min_alpha self.epochs = epochs - if reduced_windows is not None: - self.reduced_windows = bool(reduced_windows) + if shrink_windows is not None: + self.shrink_windows = bool(shrink_windows) self._check_training_sanity(epochs=epochs, total_examples=total_examples, total_words=total_words) self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=epochs) @@ -1061,7 +1061,7 @@ def train( msg=( f"training model with {self.workers} workers on {len(self.wv)} vocabulary and " f"{self.layer1_size} features, using sg={self.sg} hs={self.hs} sample={self.sample} " - f"negative={self.negative} window={self.window} reduced_windows={self.reduced_windows}" + f"negative={self.negative} window={self.window} shrink_windows={self.shrink_windows}" ), ) diff --git a/gensim/models/word2vec_corpusfile.pyx b/gensim/models/word2vec_corpusfile.pyx index 2af398e58b..da94e78ec1 100644 --- a/gensim/models/word2vec_corpusfile.pyx +++ b/gensim/models/word2vec_corpusfile.pyx @@ -187,7 +187,7 @@ cdef void prepare_c_structures_for_batch( int *effective_words, int *effective_sentences, unsigned long long *next_random, cvocab_t *vocab, int *sentence_idx, np.uint32_t *indexes, int *codelens, np.uint8_t **codes, np.uint32_t **points, np.uint32_t *reduced_windows, - int do_reduced_windows) nogil: + int shrink_windows) nogil: cdef VocabItem word cdef string token cdef vector[string] sent @@ -226,7 +226,7 @@ cdef void prepare_c_structures_for_batch( # precompute "reduced window" offsets in a single randint() call for i in range(effective_words[0]): - if do_reduced_windows: + if shrink_windows: reduced_windows[i] = random_int32(next_random) % window else: reduced_windows[i] = 0 @@ -299,7 +299,7 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec cdef long long total_sentences = 0 cdef long long 
total_effective_words = 0, total_words = 0 cdef int sent_idx, idx_start, idx_end - cdef int do_reduced_windows = int(model.reduced_windows) + cdef int shrink_windows = int(model.shrink_windows) init_w2v_config(&c, model, _alpha, compute_loss, _work) @@ -316,7 +316,7 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec prepare_c_structures_for_batch( sentences, c.sample, c.hs, c.window, &total_words, &effective_words, &effective_sentences, &c.next_random, vocab.get_vocab_ptr(), c.sentence_idx, c.indexes, - c.codelens, c.codes, c.points, c.reduced_windows, do_reduced_windows) + c.codelens, c.codes, c.points, c.reduced_windows, shrink_windows) for sent_idx in range(effective_sentences): idx_start = c.sentence_idx[sent_idx] @@ -400,7 +400,7 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp cdef long long total_sentences = 0 cdef long long total_effective_words = 0, total_words = 0 cdef int sent_idx, idx_start, idx_end - cdef int do_reduced_windows = int(model.reduced_windows) + cdef int shrink_windows = int(model.shrink_windows) init_w2v_config(&c, model, _alpha, compute_loss, _work, _neu1) @@ -417,7 +417,7 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp prepare_c_structures_for_batch( sentences, c.sample, c.hs, c.window, &total_words, &effective_words, &effective_sentences, &c.next_random, vocab.get_vocab_ptr(), c.sentence_idx, - c.indexes, c.codelens, c.codes, c.points, c.reduced_windows, do_reduced_windows) + c.indexes, c.codelens, c.codes, c.points, c.reduced_windows, shrink_windows) for sent_idx in range(effective_sentences): idx_start = c.sentence_idx[sent_idx] diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index 7389f70054..cd3b5feefa 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -570,7 +570,7 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss): break # TODO: log warning, tally overflow? # precompute "reduced window" offsets in a single randint() call - if model.reduced_windows: + if model.shrink_windows: for i, item in enumerate(model.random.randint(0, c.window, effective_words)): c.reduced_windows[i] = item else: @@ -669,7 +669,7 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss): break # TODO: log warning, tally overflow? # precompute "reduced window" offsets in a single randint() call - if model.reduced_windows: + if model.shrink_windows: for i, item in enumerate(model.random.randint(0, c.window, effective_words)): c.reduced_windows[i] = item else: From 11a464ffdae1fbc8e29f44400722d5cf626e9edb Mon Sep 17 00:00:00 2001 From: Paul Andrey Date: Fri, 11 Jun 2021 18:24:04 +0200 Subject: [PATCH 04/13] Removed `shrink_windows` argument from `Word2Vec.train`. 
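After this change the flag is set once at construction time and can no longer be toggled per `train()` call. A minimal usage sketch of the resulting API follows; the toy `sentences` corpus is made up purely for illustration.

    from gensim.models import Word2Vec

    sentences = [["human", "interface", "computer"], ["survey", "user", "system"]] * 100

    # Sampled (word2vec-style) windows vs. a fixed window of 5 words to each side.
    sampled = Word2Vec(sentences, vector_size=32, window=5, min_count=1, shrink_windows=True)
    fixed = Word2Vec(sentences, vector_size=32, window=5, min_count=1, shrink_windows=False)

    # Subsequent training reuses whatever was configured at construction time.
    fixed.train(sentences, total_examples=fixed.corpus_count, epochs=fixed.epochs)
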
--- gensim/models/word2vec.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index aa245c369a..8b8160fa12 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -425,8 +425,7 @@ def __init__( self.train( corpus_iterable=corpus_iterable, corpus_file=corpus_file, total_examples=self.corpus_count, total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha, - end_alpha=self.min_alpha, compute_loss=self.compute_loss, callbacks=callbacks, - shrink_windows=self.shrink_windows) + end_alpha=self.min_alpha, compute_loss=self.compute_loss, callbacks=callbacks) else: if trim_rule is not None: logger.warning( @@ -969,7 +968,7 @@ def train( self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False, - shrink_windows=None, callbacks=(), **kwargs, + callbacks=(), **kwargs, ): """Update the model's neural weights from a sequence of sentences. @@ -1050,8 +1049,6 @@ def train( self.alpha = start_alpha or self.alpha self.min_alpha = end_alpha or self.min_alpha self.epochs = epochs - if shrink_windows is not None: - self.shrink_windows = bool(shrink_windows) self._check_training_sanity(epochs=epochs, total_examples=total_examples, total_words=total_words) self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=epochs) From a65751889960e7a76ebe3471cf2268d9b289a5b7 Mon Sep 17 00:00:00 2001 From: Paul Andrey Date: Tue, 15 Jun 2021 22:47:24 +0200 Subject: [PATCH 05/13] Aesthetic fix. --- gensim/models/word2vec_corpusfile.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/models/word2vec_corpusfile.pyx b/gensim/models/word2vec_corpusfile.pyx index da94e78ec1..13b69d30da 100644 --- a/gensim/models/word2vec_corpusfile.pyx +++ b/gensim/models/word2vec_corpusfile.pyx @@ -187,7 +187,8 @@ cdef void prepare_c_structures_for_batch( int *effective_words, int *effective_sentences, unsigned long long *next_random, cvocab_t *vocab, int *sentence_idx, np.uint32_t *indexes, int *codelens, np.uint8_t **codes, np.uint32_t **points, np.uint32_t *reduced_windows, - int shrink_windows) nogil: + int shrink_windows, + ) nogil: cdef VocabItem word cdef string token cdef vector[string] sent From ff0d30e3ae79f52a3e5fff3b7f32d83ff5a8b6e8 Mon Sep 17 00:00:00 2001 From: Paul Andrey Date: Tue, 15 Jun 2021 22:48:04 +0200 Subject: [PATCH 06/13] Fixed old word2vec models' reloading. --- gensim/models/word2vec.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 8b8160fa12..6c9a826a59 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1989,6 +1989,8 @@ def _load_specials(self, *args, **kwargs): self.syn1 = self.syn1 del self.syn1 del self.trainables + if not hasattr(self, 'shrink_windows'): + self.shrink_windows = True def get_latest_training_loss(self): """Get current value of the training loss. From f98eb43c4aea38de32640b42b24d385bd73d5e12 Mon Sep 17 00:00:00 2001 From: Paul Andrey Date: Tue, 15 Jun 2021 22:55:22 +0200 Subject: [PATCH 07/13] Fixed undue docstring. 
--- gensim/models/word2vec.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 6c9a826a59..91817c2b15 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1025,11 +1025,6 @@ def train( compute_loss: bool, optional If True, computes and stores loss value which can be retrieved using :meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`. - shrink_windows : bool, optional - If True, the effective window size is uniformly sampled from [1, `window`] - for each target word during training, to match the original word2vec algorithm's - approximate weighting of context words by distance. Otherwise, the effective - window size is always fixed to `window` words to either side. callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional Sequence of callbacks to be executed at specific stages during training. From 3b27f1c3cb6e3ee20d75b7d2834816f60b49cac5 Mon Sep 17 00:00:00 2001 From: Paul Andrey Date: Tue, 15 Jun 2021 22:55:49 +0200 Subject: [PATCH 08/13] Added `shrink_windows` argument to Doc2Vec. --- gensim/models/doc2vec.py | 8 +++++++- gensim/models/doc2vec_corpusfile.pyx | 17 ++++++++++++----- gensim/models/doc2vec_inner.pyx | 16 +++++++++++----- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 23f7d9fc7e..072bb8e233 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -158,7 +158,7 @@ def count(self, new_val): class Doc2Vec(Word2Vec): def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, dv=None, dv_mapfile=None, comment=None, trim_rule=None, callbacks=(), - window=5, epochs=10, **kwargs): + window=5, epochs=10, shrink_windows=True, **kwargs): """Class for training, using and evaluating neural networks described in `Distributed Representations of Sentences and Documents `_. @@ -248,6 +248,11 @@ def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=No callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`, optional List of callbacks that need to be executed/run at specific stages during training. + shrink_windows : bool, optional + If True, the effective window size is uniformly sampled from [1, `window`] + for each target word during training, to match the original word2vec algorithm's + approximate weighting of context words by distance. Otherwise, the effective + window size is always fixed to `window` words to either side. 
Some important internal attributes are the following: @@ -294,6 +299,7 @@ def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=No callbacks=callbacks, window=window, epochs=epochs, + shrink_windows=shrink_windows, **kwargs, ) diff --git a/gensim/models/doc2vec_corpusfile.pyx b/gensim/models/doc2vec_corpusfile.pyx index 40bf20bdd3..a7173ca026 100644 --- a/gensim/models/doc2vec_corpusfile.pyx +++ b/gensim/models/doc2vec_corpusfile.pyx @@ -59,7 +59,7 @@ cdef void prepare_c_structures_for_batch( int *effective_words, unsigned long long *next_random, cvocab_t *vocab, np.uint32_t *indexes, int *codelens, np.uint8_t **codes, np.uint32_t **points, np.uint32_t *reduced_windows, int *document_len, int train_words, - int docvecs_count, int doc_tag, + int docvecs_count, int doc_tag, int shrink_windows, ) nogil: cdef VocabItem predict_word cdef string token @@ -88,7 +88,10 @@ cdef void prepare_c_structures_for_batch( if train_words and reduced_windows != NULL: for i in range(document_len[0]): - reduced_windows[i] = random_int32(next_random) % window + if shrink_windows: + reduced_windows[i] = random_int32(next_random) % window + else: + reduced_windows[i] = 0 if doc_tag < docvecs_count: effective_words[0] += 1 @@ -160,6 +163,7 @@ def d2v_train_epoch_dbow( cdef long long total_documents = 0 cdef long long total_effective_words = 0, total_words = 0 cdef int sent_idx, idx_start, idx_end + cdef int shrink_windows = int(model.shrink_windows) cdef vector[string] doc_words cdef long long _doc_tag = start_doctag @@ -183,7 +187,7 @@ def d2v_train_epoch_dbow( prepare_c_structures_for_batch( doc_words, c.sample, c.hs, c.window, &total_words, &effective_words, &c.next_random, vocab.get_vocab_ptr(), c.indexes, c.codelens, c.codes, c.points, - c.reduced_windows, &document_len, c.train_words, c.docvecs_count, _doc_tag) + c.reduced_windows, &document_len, c.train_words, c.docvecs_count, _doc_tag, shrink_windows) for i in range(document_len): if c.train_words: # simultaneous skip-gram wordvec-training @@ -300,6 +304,7 @@ def d2v_train_epoch_dm( cdef long long total_effective_words = 0, total_words = 0 cdef int sent_idx, idx_start, idx_end cdef REAL_t count, inv_count = 1.0 + cdef int shrink_windows = int(model.shrink_windows) cdef vector[string] doc_words cdef long long _doc_tag = start_doctag @@ -323,7 +328,7 @@ def d2v_train_epoch_dm( prepare_c_structures_for_batch( doc_words, c.sample, c.hs, c.window, &total_words, &effective_words, &c.next_random, vocab.get_vocab_ptr(), c.indexes, c.codelens, c.codes, c.points, c.reduced_windows, - &document_len, c.train_words, c.docvecs_count, _doc_tag) + &document_len, c.train_words, c.docvecs_count, _doc_tag, shrink_windows) for i in range(document_len): j = i - c.window + c.reduced_windows[i] @@ -453,6 +458,7 @@ def d2v_train_epoch_dm_concat( cdef long long total_documents = 0 cdef long long total_effective_words = 0, total_words = 0 cdef int sent_idx, idx_start, idx_end + cdef int shrink_windows = int(model.shrink_windows) cdef vector[string] doc_words cdef long long _doc_tag = start_doctag @@ -490,7 +496,8 @@ def d2v_train_epoch_dm_concat( prepare_c_structures_for_batch( doc_words, c.sample, c.hs, c.window, &total_words, &effective_words, &c.next_random, vocab.get_vocab_ptr(), c.indexes, c.codelens, c.codes, - c.points, NULL, &document_len, c.train_words, c.docvecs_count, _doc_tag) + c.points, NULL, &document_len, c.train_words, c.docvecs_count, _doc_tag, + shrink_windows) for i in range(document_len): j = i - c.window # negative OK: will pad with 
null word diff --git a/gensim/models/doc2vec_inner.pyx b/gensim/models/doc2vec_inner.pyx index 23ede53c90..ce1e214c3a 100644 --- a/gensim/models/doc2vec_inner.pyx +++ b/gensim/models/doc2vec_inner.pyx @@ -364,9 +364,12 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, c.document_len = i if c.train_words: - # single randint() call avoids a big thread-synchronization slowdown - for i, item in enumerate(model.random.randint(0, c.window, c.document_len)): - c.reduced_windows[i] = item + if model.shrink_windows: + # single randint() call avoids a big thread-synchronization slowdown + for i, item in enumerate(model.random.randint(0, c.window, c.document_len)): + c.reduced_windows[i] = item + else: + c.reduced_windows[:] = int(0) for i in range(c.doctag_len): c.doctag_indexes[i] = doctag_indexes[i] @@ -497,8 +500,11 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N c.document_len = i # single randint() call avoids a big thread-sync slowdown - for i, item in enumerate(model.random.randint(0, c.window, c.document_len)): - c.reduced_windows[i] = item + if model.shrink_windows: + for i, item in enumerate(model.random.randint(0, c.window, c.document_len)): + c.reduced_windows[i] = item + else: + c.reduced_windows[:] = int(0) for i in range(c.doctag_len): c.doctag_indexes[i] = doctag_indexes[i] From 27a49d77dde635bc8c499eed157d70a76966234d Mon Sep 17 00:00:00 2001 From: Paul Andrey Date: Tue, 15 Jun 2021 23:07:53 +0200 Subject: [PATCH 09/13] Added `shrink_windows` argument to FastText. --- gensim/models/fasttext.py | 10 ++++++++-- gensim/models/fasttext_corpusfile.pyx | 14 ++++++++++---- gensim/models/fasttext_inner.pyx | 7 +++++-- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 61c093a3be..8d396400f2 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -276,7 +276,7 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100 max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(), - max_final_vocab=None): + max_final_vocab=None, shrink_windows=True,): """Train, use and evaluate word representations learned using the method described in `Enriching Word Vectors with Subword Information `_, aka FastText. @@ -385,6 +385,11 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100 ``min_count```. If the specified ``min_count`` is more than the automatically calculated ``min_count``, the former will be used. Set to ``None`` if not required. + shrink_windows : bool, optional + If True, the effective window size is uniformly sampled from [1, `window`] + for each target word during training, to match the original word2vec algorithm's + approximate weighting of context words by distance. Otherwise, the effective + window size is always fixed to `window` words to either side. 
Examples -------- @@ -432,7 +437,8 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100 max_vocab_size=max_vocab_size, max_final_vocab=max_final_vocab, min_count=min_count, sample=sample, sorted_vocab=sorted_vocab, null_word=null_word, ns_exponent=ns_exponent, hashfxn=hashfxn, - seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha) + seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, + min_alpha=min_alpha, shrink_windows=shrink_windows) def _init_post_load(self, hidden_output): num_vectors = len(self.wv.vectors) diff --git a/gensim/models/fasttext_corpusfile.pyx b/gensim/models/fasttext_corpusfile.pyx index e5ec611aa0..d2abb4c04f 100644 --- a/gensim/models/fasttext_corpusfile.pyx +++ b/gensim/models/fasttext_corpusfile.pyx @@ -46,7 +46,8 @@ cdef void prepare_c_structures_for_batch( vector[vector[string]] &sentences, int sample, int hs, int window, long long *total_words, int *effective_words, int *effective_sentences, unsigned long long *next_random, cvocab_t *vocab, int *sentence_idx, np.uint32_t *indexes, int *codelens, np.uint8_t **codes, np.uint32_t **points, - np.uint32_t *reduced_windows, int *subwords_idx_len, np.uint32_t **subwords_idx) nogil: + np.uint32_t *reduced_windows, int *subwords_idx_len, np.uint32_t **subwords_idx, int shrink_windows, + ) nogil: cdef VocabItem word cdef string token cdef vector[string] sent @@ -89,7 +90,10 @@ cdef void prepare_c_structures_for_batch( # precompute "reduced window" offsets in a single randint() call for i in range(effective_words[0]): - reduced_windows[i] = random_int32(next_random) % window + if shrink_windows: + reduced_windows[i] = random_int32(next_random) % window + else: + reduced_windows[i] = 0 def train_epoch_sg( @@ -136,6 +140,7 @@ def train_epoch_sg( cdef long long total_sentences = 0 cdef long long total_effective_words = 0, total_words = 0 cdef int sent_idx, idx_start, idx_end + cdef int shrink_windows = int(model.shrink_windows) init_ft_config(&c, model, _alpha, _work, _l1) @@ -153,7 +158,7 @@ def train_epoch_sg( prepare_c_structures_for_batch( sentences, c.sample, c.hs, c.window, &total_words, &effective_words, &effective_sentences, &c.next_random, vocab.get_vocab_ptr(), c.sentence_idx, c.indexes, c.codelens, - c.codes, c.points, c.reduced_windows, c.subwords_idx_len, c.subwords_idx) + c.codes, c.points, c.reduced_windows, c.subwords_idx_len, c.subwords_idx, shrink_windows) for sent_idx in range(effective_sentences): idx_start = c.sentence_idx[sent_idx] @@ -226,6 +231,7 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp cdef long long total_sentences = 0 cdef long long total_effective_words = 0, total_words = 0 cdef int sent_idx, idx_start, idx_end + cdef int shrink_windows = int(model.shrink_windows) init_ft_config(&c, model, _alpha, _work, _neu1) @@ -243,7 +249,7 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp prepare_c_structures_for_batch( sentences, c.sample, c.hs, c.window, &total_words, &effective_words, &effective_sentences, &c.next_random, vocab.get_vocab_ptr(), c.sentence_idx, c.indexes, c.codelens, - c.codes, c.points, c.reduced_windows, c.subwords_idx_len, c.subwords_idx) + c.codes, c.points, c.reduced_windows, c.subwords_idx_len, c.subwords_idx, shrink_windows) for sent_idx in range(effective_sentences): idx_start = c.sentence_idx[sent_idx] diff --git a/gensim/models/fasttext_inner.pyx b/gensim/models/fasttext_inner.pyx index e71ed6f31d..7416ce7dd0 100644 --- 
a/gensim/models/fasttext_inner.pyx +++ b/gensim/models/fasttext_inner.pyx @@ -601,8 +601,11 @@ def train_batch_any(model, sentences, alpha, _work, _neu1): num_words, num_sentences = populate_ft_config(&c, model.wv, model.wv.buckets_word, sentences) # precompute "reduced window" offsets in a single randint() call - for i, randint in enumerate(model.random.randint(0, c.window, num_words)): - c.reduced_windows[i] = randint + if model.shrink_windows: + for i, randint in enumerate(model.random.randint(0, c.window, num_words)): + c.reduced_windows[i] = randint + else: + c.reduced_windows[:] = int(0) # release GIL & train on all sentences in the batch with nogil: From 6fde7e89e438efbd36495fe6081510c8a7fdb78f Mon Sep 17 00:00:00 2001 From: Paul Andrey Date: Thu, 17 Jun 2021 10:45:06 +0200 Subject: [PATCH 10/13] Fixed and optimized `shrink_windows` backend use. * `c.reduced_windows[:] = 0` syntax is not supported; as a consequence, all zero-value assignments due to `shrink_windows=False` have been rewritten as `for` loops. * Those changes have now been tested (cf. next commit). * NOTE: another way to proceed could be to initialize the `reduced_windows` array with zeros as values; it would then only be altered when `shrink_windows=True`. --- gensim/models/doc2vec_corpusfile.pyx | 7 ++++--- gensim/models/doc2vec_inner.pyx | 8 +++++--- gensim/models/fasttext_corpusfile.pyx | 7 ++++--- gensim/models/fasttext_inner.pyx | 3 ++- gensim/models/word2vec_corpusfile.pyx | 7 ++++--- gensim/models/word2vec_inner.pyx | 6 ++++-- 6 files changed, 23 insertions(+), 15 deletions(-) diff --git a/gensim/models/doc2vec_corpusfile.pyx b/gensim/models/doc2vec_corpusfile.pyx index a7173ca026..9216d13bd4 100644 --- a/gensim/models/doc2vec_corpusfile.pyx +++ b/gensim/models/doc2vec_corpusfile.pyx @@ -87,10 +87,11 @@ cdef void prepare_c_structures_for_batch( document_len[0] = i if train_words and reduced_windows != NULL: - for i in range(document_len[0]): - if shrink_windows: + if shrink_windows: + for i in range(document_len[0]): reduced_windows[i] = random_int32(next_random) % window - else: + else: + for i in range(document_len[0]): reduced_windows[i] = 0 if doc_tag < docvecs_count: diff --git a/gensim/models/doc2vec_inner.pyx b/gensim/models/doc2vec_inner.pyx index ce1e214c3a..1657c59787 100644 --- a/gensim/models/doc2vec_inner.pyx +++ b/gensim/models/doc2vec_inner.pyx @@ -364,12 +364,13 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, c.document_len = i if c.train_words: + # single randint() call avoids a big thread-synchronization slowdown if model.shrink_windows: - # single randint() call avoids a big thread-synchronization slowdown for i, item in enumerate(model.random.randint(0, c.window, c.document_len)): c.reduced_windows[i] = item else: - c.reduced_windows[:] = int(0) + for i in range(c.document_len): + c.reduced_windows[i] = 0 for i in range(c.doctag_len): c.doctag_indexes[i] = doctag_indexes[i] @@ -504,7 +505,8 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N for i, item in enumerate(model.random.randint(0, c.window, c.document_len)): c.reduced_windows[i] = item else: - c.reduced_windows[:] = int(0) + for i in range(c.document_len): + c.reduced_windows[i] = 0 for i in range(c.doctag_len): c.doctag_indexes[i] = doctag_indexes[i] diff --git a/gensim/models/fasttext_corpusfile.pyx b/gensim/models/fasttext_corpusfile.pyx index d2abb4c04f..5d275b42b6 100644 --- a/gensim/models/fasttext_corpusfile.pyx +++ b/gensim/models/fasttext_corpusfile.pyx @@ 
-89,10 +89,11 @@ cdef void prepare_c_structures_for_batch( break # precompute "reduced window" offsets in a single randint() call - for i in range(effective_words[0]): - if shrink_windows: + if shrink_windows: + for i in range(effective_words[0]): reduced_windows[i] = random_int32(next_random) % window - else: + else: + for i in range(effective_words[0]): reduced_windows[i] = 0 diff --git a/gensim/models/fasttext_inner.pyx b/gensim/models/fasttext_inner.pyx index 7416ce7dd0..e27bd62feb 100644 --- a/gensim/models/fasttext_inner.pyx +++ b/gensim/models/fasttext_inner.pyx @@ -605,7 +605,8 @@ def train_batch_any(model, sentences, alpha, _work, _neu1): for i, randint in enumerate(model.random.randint(0, c.window, num_words)): c.reduced_windows[i] = randint else: - c.reduced_windows[:] = int(0) + for i in range(num_words): + c.reduced_windows[i] = 0 # release GIL & train on all sentences in the batch with nogil: diff --git a/gensim/models/word2vec_corpusfile.pyx b/gensim/models/word2vec_corpusfile.pyx index 13b69d30da..5d7f5004e4 100644 --- a/gensim/models/word2vec_corpusfile.pyx +++ b/gensim/models/word2vec_corpusfile.pyx @@ -226,10 +226,11 @@ cdef void prepare_c_structures_for_batch( break # TODO: log warning, tally overflow? # precompute "reduced window" offsets in a single randint() call - for i in range(effective_words[0]): - if shrink_windows: + if shrink_windows: + for i in range(effective_words[0]): reduced_windows[i] = random_int32(next_random) % window - else: + else: + for i in range(effective_words[0]): reduced_windows[i] = 0 diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index a351dc04cd..ffdc908b5c 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -570,7 +570,8 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss): for i, item in enumerate(model.random.randint(0, c.window, effective_words)): c.reduced_windows[i] = item else: - c.reduced_windows[:] = int(0) + for i in range(effective_words): + c.reduced_windows[i] = 0 # release GIL & train on all sentences with nogil: @@ -669,7 +670,8 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss): for i, item in enumerate(model.random.randint(0, c.window, effective_words)): c.reduced_windows[i] = item else: - c.reduced_windows[:] = int(0) + for i in range(effective_words): + c.reduced_windows[i] = 0 # release GIL & train on all sentences with nogil: From 486b3f502e45b4bc06bf1a9196c9de41587478b1 Mon Sep 17 00:00:00 2001 From: Paul Andrey Date: Thu, 17 Jun 2021 11:15:26 +0200 Subject: [PATCH 11/13] Added tests for `shrink_windows=False` in Word2Vec-based models. 
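Beyond the end-to-end training tests below, a quick ad-hoc check is that the flag survives a save/load round trip (older models lacking the attribute fall back to True via `_load_specials`). This snippet is illustrative only and not part of the patch; the corpus and file name are placeholders.

    from gensim.models import Word2Vec
    from gensim.test.utils import common_texts, get_tmpfile

    model = Word2Vec(common_texts, vector_size=16, window=4, min_count=1, epochs=2, shrink_windows=False)
    assert model.shrink_windows is False

    path = get_tmpfile("w2v_fixed_window.model")
    model.save(path)
    assert Word2Vec.load(path).shrink_windows is False
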
---
 gensim/test/test_doc2vec.py | 38 ++++++++++++++++++++++++++++++++++++
 gensim/test/test_fasttext.py | 28 ++++++++++++++++++--------
 gensim/test/test_word2vec.py | 32 ++++++++++++++++++++++++++++++
 3 files changed, 90 insertions(+), 8 deletions(-)

diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
index 60c9158744..ebb1af7cbe 100644
--- a/gensim/test/test_doc2vec.py
+++ b/gensim/test/test_doc2vec.py
@@ -589,6 +589,44 @@ def test_dmc_neg_fromfile(self):
         )
         self.model_sanity(model)

+    def test_dmm_fixedwindowsize(self):
+        """Test DMM doc2vec training with fixed window size."""
+        model = doc2vec.Doc2Vec(
+            list_corpus, vector_size=24,
+            dm=1, dm_mean=1, window=4, shrink_windows=False,
+            hs=0, negative=10, alpha=0.05, min_count=2, epochs=20
+        )
+        self.model_sanity(model)
+
+    def test_dmm_fixedwindowsize_fromfile(self):
+        """Test DMM doc2vec training with fixed window size, from file."""
+        with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file:
+            save_lee_corpus_as_line_sentence(corpus_file)
+            model = doc2vec.Doc2Vec(
+                corpus_file=corpus_file, vector_size=24,
+                dm=1, dm_mean=1, window=4, shrink_windows=False,
+                hs=0, negative=10, alpha=0.05, min_count=2, epochs=20
+            )
+            self.model_sanity(model)
+
+    def test_dbow_fixedwindowsize(self):
+        """Test DBOW doc2vec training with fixed window size."""
+        model = doc2vec.Doc2Vec(
+            list_corpus, vector_size=16, shrink_windows=False,
+            dm=0, hs=0, negative=5, min_count=2, epochs=20
+        )
+        self.model_sanity(model)
+
+    def test_dbow_fixedwindowsize_fromfile(self):
+        """Test DBOW doc2vec training with fixed window size, from file."""
+        with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file:
+            save_lee_corpus_as_line_sentence(corpus_file)
+            model = doc2vec.Doc2Vec(
+                corpus_file=corpus_file, vector_size=16, shrink_windows=False,
+                dm=0, hs=0, negative=5, min_count=2, epochs=20
+            )
+            self.model_sanity(model)
+
     def test_parallel(self):
         """Test doc2vec parallel training with more than default 3 threads."""
         # repeat the ~300 doc (~60000 word) Lee corpus to get 6000 docs (~1.2M words)
diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
index a91a9000a7..382ecdfb2c 100644
--- a/gensim/test/test_fasttext.py
+++ b/gensim/test/test_fasttext.py
@@ -397,12 +397,12 @@ def test_wm_distance(self):
         dist = self.test_model.wv.wmdistance(doc, oov_doc)
         self.assertNotEqual(float('inf'), dist)

-    def test_cbow_hs_training(self):
+    def test_cbow_hs_training(self, shrink_windows=True):
         model_gensim = FT_gensim(
             vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
             min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-            sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)
+            sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET, shrink_windows=shrink_windows)

         lee_data = LineSentence(datapath('lee_background.cor'))
         model_gensim.build_vocab(lee_data)
@@ -429,12 +429,12 @@ def test_cbow_hs_training(self):
             overlap_count, 2,
             "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words))

-    def test_cbow_hs_training_fromfile(self):
+    def test_cbow_hs_training_fromfile(self, shrink_windows=True):
         with temporary_file('gensim_fasttext.tst') as corpus_file:
             model_gensim = FT_gensim(
                 vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
                 min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-                sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4)
+                sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4, shrink_windows=shrink_windows)

             lee_data = LineSentence(datapath('lee_background.cor'))
             utils.save_as_line_sentence(lee_data, corpus_file)
@@ -465,12 +465,12 @@ def test_cbow_hs_training_fromfile(self):
             overlap_count, 2,
             "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words))

-    def test_sg_hs_training(self):
+    def test_sg_hs_training(self, shrink_windows=True):
         model_gensim = FT_gensim(
             vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
             min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-            sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)
+            sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET, shrink_windows=shrink_windows)

         lee_data = LineSentence(datapath('lee_background.cor'))
         model_gensim.build_vocab(lee_data)
@@ -497,12 +497,12 @@ def test_sg_hs_training(self):
             overlap_count, 2,
             "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words))

-    def test_sg_hs_training_fromfile(self):
+    def test_sg_hs_training_fromfile(self, shrink_windows=True):
         with temporary_file('gensim_fasttext.tst') as corpus_file:
             model_gensim = FT_gensim(
                 vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
                 min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-                sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)
+                sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET, shrink_windows=shrink_windows)

             lee_data = LineSentence(datapath('lee_background.cor'))
             utils.save_as_line_sentence(lee_data, corpus_file)
@@ -533,6 +533,18 @@ def test_sg_hs_training_fromfile(self):
             overlap_count, 2,
             "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words))

+    def test_cbow_hs_training_fixedwindowsize(self):
+        self.test_cbow_hs_training(shrink_windows=False)
+
+    def test_cbow_hs_training_fixedwindowsize_fromfile(self):
+        self.test_cbow_hs_training_fromfile(shrink_windows=False)
+
+    def test_sg_hs_training_fixedwindowsize(self):
+        self.test_sg_hs_training(shrink_windows=False)
+
+    def test_sg_hs_training_fixedwindowsize_fromfile(self):
+        self.test_sg_hs_training_fromfile(shrink_windows=False)
+
     def test_cbow_neg_training(self):

         model_gensim = FT_gensim(
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index d46b2f3e37..e85cee0d5d 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -686,6 +686,38 @@ def test_cbow_neg_fromfile(self):
         )
         self.model_sanity(model, with_corpus_file=True)

+    def test_sg_fixedwindowsize(self):
+        """Test skipgram with fixed window size. Use NS."""
+        model = word2vec.Word2Vec(
+            sg=1, window=5, shrink_windows=False, hs=0,
+            negative=15, min_count=5, epochs=10, workers=2
+        )
+        self.model_sanity(model)
+
+    def test_sg_fixedwindowsize_fromfile(self):
+        """Test skipgram with fixed window size. Use HS and train from file."""
+        model = word2vec.Word2Vec(
+            sg=1, window=5, shrink_windows=False, hs=1,
+            negative=0, min_count=5, epochs=10, workers=2
+        )
+        self.model_sanity(model, with_corpus_file=True)
+
+    def test_cbow_fixedwindowsize(self, ranks=None):
+        """Test CBOW with fixed window size. Use HS."""
+        model = word2vec.Word2Vec(
+            sg=0, cbow_mean=1, alpha=0.1, window=5, shrink_windows=False,
+            hs=1, negative=0, min_count=5, epochs=10, workers=2
+        )
+        self.model_sanity(model, ranks=ranks)
+
+    def test_cbow_fixedwindowsize_fromfile(self):
+        """Test CBOW with fixed window size. Use NS and train from file."""
+        model = word2vec.Word2Vec(
+            sg=0, cbow_mean=1, alpha=0.1, window=5, shrink_windows=False,
+            hs=0, negative=15, min_count=5, epochs=10, workers=2
+        )
+        self.model_sanity(model, with_corpus_file=True)
+
     def test_cosmul(self):
         model = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0)
         sims = model.wv.most_similar_cosmul('graph', topn=10)

From f0af84ead2777b76e0898bec730e07babefcf0fc Mon Sep 17 00:00:00 2001
From: Paul Andrey
Date: Thu, 17 Jun 2021 11:16:36 +0200
Subject: [PATCH 12/13] Added docstring mentions of `shrink_windows` being experimental.

---
 gensim/models/doc2vec.py | 1 +
 gensim/models/fasttext.py | 1 +
 gensim/models/word2vec.py | 1 +
 3 files changed, 3 insertions(+)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index 072bb8e233..e6fcb72f0e 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -249,6 +249,7 @@ def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=No
         callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`, optional
             List of callbacks that need to be executed/run at specific stages during training.
         shrink_windows : bool, optional
+            New in 4.1. Experimental.
             If True, the effective window size is uniformly sampled from [1, `window`]
             for each target word during training, to match the original word2vec algorithm's
             approximate weighting of context words by distance. Otherwise, the effective
diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 8d396400f2..ba08a15a92 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -386,6 +386,7 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
             automatically calculated ``min_count``, the former will be used.
             Set to ``None`` if not required.
         shrink_windows : bool, optional
+            New in 4.1. Experimental.
             If True, the effective window size is uniformly sampled from [1, `window`]
             for each target word during training, to match the original word2vec algorithm's
             approximate weighting of context words by distance. Otherwise, the effective
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 91817c2b15..50999acb3f 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -346,6 +346,7 @@ def __init__(
         callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
             Sequence of callbacks to be executed at specific stages during training.
         shrink_windows : bool, optional
+            New in 4.1. Experimental.
             If True, the effective window size is uniformly sampled from [1, `window`]
             for each target word during training, to match the original word2vec algorithm's
             approximate weighting of context words by distance. Otherwise, the effective

From 49cf50ecf010138a0573346e34e952f0c79dc1b0 Mon Sep 17 00:00:00 2001
From: Paul Andrey
Date: Tue, 22 Jun 2021 14:54:17 +0200
Subject: [PATCH 13/13] Rolled back some purely aesthetic changes.
---
 gensim/models/word2vec.py | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 50999acb3f..265364890b 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -917,14 +917,12 @@ def _do_train_epoch(
         if self.sg:
             examples, tally, raw_tally = train_epoch_sg(
                 self, corpus_file, offset, cython_vocab, cur_epoch,
-                total_examples, total_words, work, neu1,
-                self.compute_loss,
+                total_examples, total_words, work, neu1, self.compute_loss
             )
         else:
             examples, tally, raw_tally = train_epoch_cbow(
                 self, corpus_file, offset, cython_vocab, cur_epoch,
-                total_examples, total_words, work, neu1,
-                self.compute_loss,
+                total_examples, total_words, work, neu1, self.compute_loss
            )

         return examples, tally, raw_tally
@@ -950,15 +948,9 @@ def _do_train_job(self, sentences, alpha, inits):
         work, neu1 = inits
         tally = 0
         if self.sg:
-            tally += train_batch_sg(
-                self, sentences, alpha, work,
-                self.compute_loss,
-            )
+            tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss)
         else:
-            tally += train_batch_cbow(
-                self, sentences, alpha, work, neu1,
-                self.compute_loss,
-            )
+            tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
         return tally, self._raw_word_count(sentences)

     def _clear_post_train(self):
         self.wv.norms = None

     def train(
-        self, corpus_iterable=None, corpus_file=None, total_examples=None,
-        total_words=None, epochs=None, start_alpha=None, end_alpha=None,
-        word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False,
-        callbacks=(), **kwargs,
+        self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None,
+        epochs=None, start_alpha=None, end_alpha=None, word_count=0,
+        queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(),
+        **kwargs,
     ):
         """Update the model's neural weights from a sequence of sentences.
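To close the series, a short usage sketch of the option these patches expose. This assumes a gensim build that includes the changes above (4.1 or later) and uses the toy `common_texts` corpus bundled with gensim; the hyperparameter values are arbitrary examples, not recommendations from the patch authors.

from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# shrink_windows=False keeps the full `window` words of context on each side,
# instead of sampling an effective window size from [1, window] per target word.
model = Word2Vec(
    common_texts, vector_size=32, window=5, min_count=1, epochs=10,
    shrink_windows=False,
)
print(model.wv.most_similar('computer', topn=3))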