From feb3c321aa12561125918a2fc3b86142f973d87e Mon Sep 17 00:00:00 2001 From: "Stergiadis, E" Date: Wed, 28 Feb 2018 16:11:33 +0100 Subject: [PATCH 01/41] Remove useless methods --- gensim/models/base_any2vec.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index e6a31263ec..1d1467f72b 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -273,13 +273,6 @@ def train(self, data_iterable, epochs=None, total_examples=None, callback.on_train_end(self) return trained_word_count, raw_word_count - @classmethod - def load(cls, fname_or_handle, **kwargs): - return super(BaseAny2VecModel, cls).load(fname_or_handle, **kwargs) - - def save(self, fname_or_handle, **kwargs): - super(BaseAny2VecModel, self).save(fname_or_handle, **kwargs) - class BaseWordEmbeddingsModel(BaseAny2VecModel): """ @@ -288,15 +281,6 @@ class BaseWordEmbeddingsModel(BaseAny2VecModel): """ - def _clear_post_train(self): - raise NotImplementedError() - - def _do_train_job(self, data_iterable, job_parameters, thread_private_mem): - raise NotImplementedError() - - def _set_train_params(self, **kwargs): - raise NotImplementedError() - def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000, trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, cbow_mean=1, min_alpha=0.0001, compute_loss=False, fast_version=0, **kwargs): From 52eb1b39cdc30d9c3275dd2e067ff861ae8427d2 Mon Sep 17 00:00:00 2001 From: "Stergiadis, E" Date: Wed, 28 Feb 2018 17:44:28 +0100 Subject: [PATCH 02/41] started working on docstrings --- gensim/models/base_any2vec.py | 80 +++++++++++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 4 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 1d1467f72b..0bdf488e0c 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -449,7 +449,7 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca Parameters ---------- - sentences : iterable of iterables + sentences : iterable of iterables of str The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` @@ -476,10 +476,10 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No Parameters ---------- - word_freq : dict - Word,Word_Count dictionary. + word_freq : dict of (str, int) + A mapping from a word in the vocabulary to its frequency count. keep_raw_vocab : bool - If not true, delete the raw vocabulary after the scaling is done and free up RAM. + If False, delete the raw vocabulary after the scaling is done to free up RAM. corpus_count : int Even if no corpus is provided, this argument can set corpus_count explicitly. trim_rule : function @@ -576,6 +576,31 @@ def _raw_word_count(self, job): return sum(len(sentence) for sentence in job) def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs): + """Checks whether the training parameters make sense. + + Called right before training starts in :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.train` + and raises warning or errors depending on the severity of the issue in case an inconsistent parameter combination + is detected. 
+ + Parameters + ---------- + epochs : int + Number of training epochs. Must have a (non None) value. + total_examples : int + Number of documents in the corpus. Either `total_examples` or `total_words` **must** be supplied. + total_words : int + Number of words in the corpus. Either `total_examples` or `total_words` **must** be supplied. + **kwargs + Unused. Present to preserve signature among base and inherited implementations. + + Raises + ------ + RuntimeError + If one of the required training pre/post processing steps have not been performed. + ValueError + If the combination of input parameters is inconsistent. + + """ if self.alpha > self.min_alpha_yet_reached: logger.warning("Effective 'alpha' higher than previous training cycles") if self.model_trimmed_post_training: @@ -610,6 +635,35 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N @classmethod def load(cls, *args, **kwargs): + """Load a previously saved object (using :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.save`) from file. + + + Also initializes extra instance attributes in case the loaded model does not include them. + `*args` or `**kwargs` **MUST** include the fname argument (path to saved file). + See :meth:`~gensim.utils.SaveLoad.load`. + + Parameters + ---------- + *args + Positional arguments passed to :meth:`~gensim.utils.SaveLoad.load`. + **kwargs + Key word arguments passed to :meth:`~gensim.utils.SaveLoad.load`. + + See Also + -------- + :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.save` + + Returns + ------- + :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + Model loaded from disk. + + Raises + ------ + IOError + When methods are called on instance (should be called from class). + """ + model = super(BaseWordEmbeddingsModel, cls).load(*args, **kwargs) if model.negative and hasattr(model.wv, 'index2word'): model.vocabulary.make_cum_table(model.wv) # rebuild cum_table from vocabulary @@ -626,6 +680,24 @@ def load(cls, *args, **kwargs): def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples, raw_word_count, total_words, trained_word_count, elapsed): + """Callback used to log progress for long running jobs. + + Parameters + ---------- + job_queue : + progress_queue + cur_epoch + example_count + total_examples + raw_word_count + total_words + trained_word_count + elapsed + + Returns + ------- + + """ if total_examples: # examples-based progress % logger.info( From cb7b71ae26f7701ef3002710b58e7abb5121a915 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Wed, 28 Feb 2018 22:44:01 +0100 Subject: [PATCH 03/41] more work done --- gensim/models/base_any2vec.py | 125 +++++++++++++++++++++++++++++++--- 1 file changed, 115 insertions(+), 10 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 0bdf488e0c..ba8126ec90 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -445,12 +445,11 @@ def __str__(self): def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). - Each sentence is a iterable of iterables (can simply be a list of unicode strings too). 
Parameters ---------- - sentences : iterable of iterables of str - The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, + sentences : iterable of iterable of str + Can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. @@ -458,6 +457,19 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca If true, the new words in `sentences` will be added to model's vocab. progress_per : int Indicates how many words to process before showing/updating the progress. + keep_raw_vocab : bool + If False, the raw vocabulary will be deleted after the scaling is done to free up RAM. + trim_rule : function + Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, + be trimmed away, or handled using the default (discard if word count < min_count). + Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), + or a callable that accepts parameters (word, count, min_count) and returns either + :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. + Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part + of the model. + **kwargs + Key word arguments propagated to `self.vocabulary.prepare_vocab` + """ total_words, corpus_count = self.vocabulary.scan_vocab( @@ -471,12 +483,10 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): """Build vocabulary from a dictionary of word frequencies. - Build model vocabulary from a passed dictionary that contains (word,word count). - Words must be of type unicode strings. Parameters ---------- - word_freq : dict of (str, int) + word_freq : dict of (unicode str, int) A mapping from a word in the vocabulary to its frequency count. keep_raw_vocab : bool If False, delete the raw vocabulary after the scaling is done to free up RAM. @@ -510,7 +520,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No len(raw_vocab), sum(itervalues(raw_vocab)) ) - # Since no sentences are provided, this is to control the corpus_count + # Since no sentences are provided, this is to control the `corpus_count` self.corpus_count = corpus_count or 0 self.vocabulary.raw_vocab = raw_vocab @@ -523,7 +533,20 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No self.hs, self.negative, self.wv, update=update, vocabulary=self.vocabulary) # build tables & arrays def estimate_memory(self, vocab_size=None, report=None): - """Estimate required memory for a model using current settings and provided vocabulary size.""" + """Estimate required memory for a model using current settings and provided vocabulary size. + + Parameters + ---------- + vocab_size : int, optional + Number of unique tokens in the vocabulary + report : dict of (str, int), optional + A dictionary from string representations of the model's memory consuming members to their size in bytes. 
+ + Returns + ------- + dict of (str, int), optional + A dictionary from string representations of the model's memory consuming members to their size in bytes. + """ vocab_size = vocab_size or len(self.wv.vocab) report = report or {} report['vocab'] = vocab_size * (700 if self.hs else 500) @@ -542,6 +565,42 @@ def estimate_memory(self, vocab_size=None, report=None): def train(self, sentences, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()): + """Train the model. If the hyper-parameters are passed, they override the ones set in the constructor. + + Parameters + ---------- + sentences : iterable of iterable of str + Can be simply a list of lists of tokens, but for larger corpora, + consider an iterable that streams the sentences directly from disk/network. + See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` + or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + total_examples : int + Count of sentences. + total_words : int + Count of raw words in sentences. + epochs : int + Number of iterations (epochs) over the corpus. + start_alpha : float + Initial learning rate. + end_alpha : float + Final learning rate. Drops linearly with the number of iterations from `start_alpha`. + word_count : int + Count of words already trained. Leave this to 0 for the usual case of training on all words in sentences. + queue_factor : int + Multiplier for size of queue -> size = number of workers * queue_factor. + report_delay : float + Seconds to wait before reporting progress. + compute_loss : bool + If True, loss will be computed while training the Word2Vec model and stored in + :attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss`. + callbacks : list of :class: `~gensim.models.callbacks.CallbackAny2Vec` + List of callbacks that need to be executed/run at specific stages during training. + + Returns + ------- + (int, int) + Tuple of (effective word count after ignoring unknown words and sentence length trimming, total word count). + """ self.alpha = start_alpha or self.alpha self.min_alpha = end_alpha or self.min_alpha @@ -553,11 +612,39 @@ def train(self, sentences, total_examples=None, total_words=None, queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks) def _get_job_params(self, cur_epoch): - """Get the parameter required for each batch.""" + """Get the learning rate used in the current epoch. + + Parameters + ---------- + cur_epoch : int + Current iteration through the corpus + + Returns + ------- + float + The learning rate for this epoch (it is linearly reduced with epochs from `self.alpha` to `self.min_alpha`). + """ alpha = self.alpha - ((self.alpha - self.min_alpha) * float(cur_epoch) / self.epochs) return alpha def _update_job_params(self, job_params, epoch_progress, cur_epoch): + """Returns the correct learning rate for the next iteration. + + Parameters + ---------- + job_params : dict of (str, obj) + Unused. TODO: Delete this. + epoch_progress : float + Ratio of finished work in the current epoch. + cur_epoch : int + Number of current iteration. + + Returns + ------- + float + The learning rate to be used in the next training epoch. 
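+
+        Notes
+        -----
+        A sketch of the decay rule implemented below: the learning rate drops linearly from
+        `self.alpha` to `self.min_alpha` over the full training run, i.e.
+        `next_alpha = self.alpha - (self.alpha - self.min_alpha) * (cur_epoch + epoch_progress) / self.epochs`,
+        clamped from below at `self.min_alpha`.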
+ + """ start_alpha = self.alpha end_alpha = self.min_alpha progress = (cur_epoch + epoch_progress) / self.epochs @@ -567,12 +654,30 @@ def _update_job_params(self, job_params, epoch_progress, cur_epoch): return next_alpha def _get_thread_working_mem(self): + """Computes the memory used per worker thread. + + Returns + ------- + (np.ndarray, np.ndarray) + Each worker threads private work memory. + """ work = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) # per-thread private work memory neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) return work, neu1 def _raw_word_count(self, job): - """Get the number of words in a given job.""" + """Get the number of words in a given job. + + Parameters + ---------- + job: iterable of iterable of str + The corpus chunk processed in a single batch. + + Returns + ------- + int + Number of raw words in the corpus chunk. + """ return sum(len(sentence) for sentence in job) def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs): From 347cdb030aa7d9a77e75b421a08f535c2c702f94 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Thu, 1 Mar 2018 12:41:10 +0100 Subject: [PATCH 04/41] Finished documentation for the `BaseWordEmbeddingsModel --- gensim/models/base_any2vec.py | 183 +++++++++++++++++++++++++++------- 1 file changed, 149 insertions(+), 34 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index ba8126ec90..1e2f4da916 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -284,6 +284,64 @@ class BaseWordEmbeddingsModel(BaseAny2VecModel): def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000, trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, cbow_mean=1, min_alpha=0.0001, compute_loss=False, fast_version=0, **kwargs): + """Construct a base word embeddings model. + + Parameters + ---------- + sentences : iterable of iterable of str + Can be simply a list of lists of tokens, but for larger corpora, + consider an iterable that streams the sentences directly from disk/network. + See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` + or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + workers : int + Number of working threads, used for multiprocessing. + vector_size : int + Dimensionality of the feature vectors. + epochs : int + Number of iterations (epochs) of training through the corpus. + callbacks : list of :class: `~gensim.models.callbacks.CallbackAny2Vec`, optional + List of callbacks that need to be executed/run at specific stages during training. + batch_words : int + Number of words to be processed by a single job. + trim_rule : function, optional + Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, + be trimmed away, or handled using the default (discard if word count < min_count). + Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), + or a callable that accepts parameters (word, count, min_count) and returns either + :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. + Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part + of the model. + sg : int {1, 0} + Defines the training algorithm. If 1, skip-gram is used, otherwise, CBOW is employed. 
+ alpha : float + The beginning learning rate. This will linearly reduce with iterations until it reaches `min_alpha`. + window : int + The maximum distance between the current and predicted word within a sentence. + seed : int + Seed for the random number generator. Initial vectors for each word are seeded with a hash of + the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run, + you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter + from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires + use of the `PYTHONHASHSEED` environment variable to control hash randomization). + hs : int {1,0} + If 1, hierarchical softmax will be used for model training. + If set to 0, and `negative` is non-zero, negative sampling will be used. + negative : int + If > 0, negative sampling will be used, the int for negative specifies how many "noise words" + should be drawn (usually between 5-20). + If set to 0, no negative sampling is used. + cbow_mean : int {1,0} + If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. + min_alpha : float, optional + Final learning rate. Drops linearly with the number of iterations from `alpha`. + compute_loss : bool, optional + If True, loss will be computed while training the Word2Vec model and stored in + :attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss`. + fast_version : int {-1, 1} + Whether or not the fast cython implementation of the internal training methods is available. 1 means it is. + **kwargs + Key word arguments needed to allow children classes to accept more arguments. + """ self.sg = int(sg) if vector_size % 4 != 0: logger.warning("consider setting layer size to a multiple of 4 for greater performance") @@ -439,6 +497,14 @@ def cum_table(self): del self.vocabulary.cum_table def __str__(self): + """Return a human readable representation of the object. + + Returns + ------- + str + A human readable string containing the class name, as well as the id to word mapping, number of + features and starting learning rate used by the object. + """ return "%s(vocab=%s, size=%s, alpha=%s)" % ( self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha ) @@ -453,13 +519,13 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - update : bool + update : bool, optional If true, the new words in `sentences` will be added to model's vocab. - progress_per : int + progress_per : int, optional Indicates how many words to process before showing/updating the progress. - keep_raw_vocab : bool + keep_raw_vocab : bool, optional If False, the raw vocabulary will be deleted after the scaling is done to free up RAM. - trim_rule : function + trim_rule : function, optional Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). 
Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), @@ -488,11 +554,11 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No ---------- word_freq : dict of (unicode str, int) A mapping from a word in the vocabulary to its frequency count. - keep_raw_vocab : bool + keep_raw_vocab : bool, optional If False, delete the raw vocabulary after the scaling is done to free up RAM. - corpus_count : int + corpus_count : int, optional Even if no corpus is provided, this argument can set corpus_count explicitly. - trim_rule : function + trim_rule : function, optional Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), @@ -500,7 +566,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. - update : bool + update : bool, optional If true, the new provided words in `word_freq` dict will be added to model's vocab. Examples @@ -574,26 +640,26 @@ def train(self, sentences, total_examples=None, total_words=None, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - total_examples : int + total_examples : int, optional Count of sentences. - total_words : int + total_words : int, optional Count of raw words in sentences. - epochs : int + epochs : int, optional Number of iterations (epochs) over the corpus. - start_alpha : float + start_alpha : float, optional Initial learning rate. - end_alpha : float + end_alpha : float, optional Final learning rate. Drops linearly with the number of iterations from `start_alpha`. - word_count : int + word_count : int, optional Count of words already trained. Leave this to 0 for the usual case of training on all words in sentences. - queue_factor : int + queue_factor : int, optional Multiplier for size of queue -> size = number of workers * queue_factor. - report_delay : float + report_delay : float, optional Seconds to wait before reporting progress. - compute_loss : bool + compute_loss : bool, optional If True, loss will be computed while training the Word2Vec model and stored in :attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss`. - callbacks : list of :class: `~gensim.models.callbacks.CallbackAny2Vec` + callbacks : list of :class: `~gensim.models.callbacks.CallbackAny2Vec`, optional List of callbacks that need to be executed/run at specific stages during training. Returns @@ -689,11 +755,11 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N Parameters ---------- - epochs : int + epochs : int, optional Number of training epochs. Must have a (non None) value. - total_examples : int + total_examples : int, optional Number of documents in the corpus. Either `total_examples` or `total_words` **must** be supplied. - total_words : int + total_words : int, optional Number of words in the corpus. Either `total_examples` or `total_words` **must** be supplied. **kwargs Unused. 
Present to preserve signature among base and inherited implementations. @@ -742,7 +808,6 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N def load(cls, *args, **kwargs): """Load a previously saved object (using :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.save`) from file. - Also initializes extra instance attributes in case the loaded model does not include them. `*args` or `**kwargs` **MUST** include the fname argument (path to saved file). See :meth:`~gensim.utils.SaveLoad.load`. @@ -789,18 +854,29 @@ def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, tot Parameters ---------- - job_queue : - progress_queue - cur_epoch - example_count - total_examples - raw_word_count - total_words - trained_word_count - elapsed - - Returns - ------- + job_queue : Queue of (iterable of object, dict of (str, float)) + The queue of jobs still to be performed by workers. Each job is represented as a tuple containing + the batch of data to be processed and the parameters to be used for the processing as a dict. + progress_queue : Queue of (int, int, int) + A queue of progress reports. Each report is represented as a tuple of these 3 elements: + * size of data chunk processed, for example number of sentences in the corpus chunk. + * Effective word count used in training (after ignoring unknown words and trimming the sentence length). + * Total word count used in training. + cur_epoch : int + The current training iteration through the corpus. + example_count : int + Number of examples (could be sentences for example) processed until now. + total_examples : int + Number of all examples present in the input corpus. + raw_word_count : int + Number of words used in training until now. + total_words : int + Number of all words in the input corpus. + trained_word_count : int + Number of effective words used in training until now (after ignoring unknown words and trimming + the sentence length). + elapsed : int + Elapsed time since the beginning of training in seconds. """ if total_examples: @@ -820,6 +896,31 @@ def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, tot def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words, trained_word_count, elapsed): + """Callback used to log the end of a training epoch + + Parameters + ---------- + cur_epoch : int + The current training iteration through the corpus. + example_count : int + Number of examples (could be sentences for example) processed until now. + total_examples : int + Number of all examples present in the input corpus. + raw_word_count : int + Number of words used in training until now. + total_words : int + Number of all words in the input corpus. + trained_word_count : int + Number of effective words used in training until now (after ignoring unknown words and trimming + the sentence length). + elapsed : int + Elapsed time since the beginning of training in seconds. + + Warnings + -------- + In case the corpus is changed while the epoch was running. + + """ logger.info( "EPOCH - %i : training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", cur_epoch + 1, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed @@ -838,6 +939,20 @@ def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_coun ) def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_tally): + """Callback to log the end of training. 
+ + Parameters + ---------- + raw_word_count : int + Number of words used in the whole training. + trained_word_count : int + Number of effective words used in training (after ignoring unknown words and trimming the sentence length). + total_elapsed : int + Total time spent during training in seconds. + job_tally : int + Total number of jobs processed during training. + + """ logger.info( "training on a %i raw words (%i effective words) took %.1fs, %.0f effective words/s", raw_word_count, trained_word_count, total_elapsed, trained_word_count / total_elapsed From 327afc592b83171e402fc841b6ceaafbe260d162 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Thu, 1 Mar 2018 12:48:47 +0100 Subject: [PATCH 05/41] PEP-8 --- gensim/models/base_any2vec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 1e2f4da916..8a0031ec37 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -750,8 +750,8 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N """Checks whether the training parameters make sense. Called right before training starts in :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.train` - and raises warning or errors depending on the severity of the issue in case an inconsistent parameter combination - is detected. + and raises warning or errors depending on the severity of the issue in case an inconsistent parameter + combination is detected. Parameters ---------- From bb8e3a38955c46011a42f8b5ba9e6d284e223156 Mon Sep 17 00:00:00 2001 From: "Stergiadis, E" Date: Fri, 2 Mar 2018 13:24:58 +0100 Subject: [PATCH 06/41] Revert "Remove useless methods" This reverts commit feb3c321aa12561125918a2fc3b86142f973d87e. 
--- gensim/models/base_any2vec.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 8a0031ec37..fe298905c2 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -273,6 +273,13 @@ def train(self, data_iterable, epochs=None, total_examples=None, callback.on_train_end(self) return trained_word_count, raw_word_count + @classmethod + def load(cls, fname_or_handle, **kwargs): + return super(BaseAny2VecModel, cls).load(fname_or_handle, **kwargs) + + def save(self, fname_or_handle, **kwargs): + super(BaseAny2VecModel, self).save(fname_or_handle, **kwargs) + class BaseWordEmbeddingsModel(BaseAny2VecModel): """ @@ -281,6 +288,15 @@ class BaseWordEmbeddingsModel(BaseAny2VecModel): """ + def _clear_post_train(self): + raise NotImplementedError() + + def _do_train_job(self, data_iterable, job_parameters, thread_private_mem): + raise NotImplementedError() + + def _set_train_params(self, **kwargs): + raise NotImplementedError() + def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000, trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, cbow_mean=1, min_alpha=0.0001, compute_loss=False, fast_version=0, **kwargs): From 7e89ca9c88315e3b346b4d6bf3dd4972ac3b3d97 Mon Sep 17 00:00:00 2001 From: "Stergiadis, E" Date: Mon, 5 Mar 2018 13:48:44 +0100 Subject: [PATCH 07/41] added documentation for the class and all its helper methods --- gensim/models/base_any2vec.py | 8 +- gensim/models/word2vec.py | 391 ++++++++++++++++++++++++++++------ 2 files changed, 332 insertions(+), 67 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index fe298905c2..d5098ce277 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -282,9 +282,11 @@ def save(self, fname_or_handle, **kwargs): class BaseWordEmbeddingsModel(BaseAny2VecModel): - """ - Base class containing common methods for training, using & evaluating word embeddings learning models. - For example - `Word2Vec`, `FastText`, etc. + """Base class containing common methods for training, using & evaluating word embeddings learning models. + + Example implementations are + * :class:`~gensim.models.word2vec.Word2Vec` + * :class:`~gensim.models.word2vec.FastText`, etc. """ diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 50be048f4a..97b90dae97 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -145,12 +145,34 @@ MAX_WORDS_IN_BATCH = 10000 def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): - """ - Update skip-gram model by training on a sequence of sentences. - Each sentence is a list of string tokens, which are looked up in the model's - vocab dictionary. Called internally from `Word2Vec.train()`. + """Update skip-gram model by training on a batch of sentences. + + Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`. + + Notes + ----- This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from word2vec_inner instead. + will use the optimized version found in :mod:`~gensim.models.word2vec_inner` instead. + + Parameters + ---------- + model : :class:`~gensim.models.word2Vec.Word2Vec` + The Word2Vec model instance to train. + sentences : iterable of iterable of str + The corpus used to train the model. + alpha : float + The learning rate + work : object, optional + Unused. 
+ compute_loss : bool, optional + Whether or not the training loss should be computed in this batch. + + Returns + ------- + int + Number of words in the vocabulary actually used for training (They already existed in the vocabulary + and were not discarded by negative sampling). + """ result = 0 for sentence in sentences: @@ -172,12 +194,35 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): return result def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False): - """ - Update CBOW model by training on a sequence of sentences. - Each sentence is a list of string tokens, which are looked up in the model's - vocab dictionary. Called internally from `Word2Vec.train()`. + """Update CBOW model by training on a batch of sentences. + + Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`. + + Notes + ----- This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from word2vec_inner instead. + will use the optimized version found in :mod:`~gensim.models.word2vec_inner` instead. + + Parameters + ---------- + model : :class:`~gensim.models.word2Vec.Word2Vec` + The Word2Vec model instance to train. + sentences : iterable of iterable of str + The corpus used to train the model. + alpha : float + The learning rate + work : object, optional + Unused. + neu1 : object, optional + Unused. + compute_loss : bool, optional + Whether or not the training loss should be computed in this batch. + + Returns + ------- + int + Number of words in the vocabulary actually used for training (They already existed in the vocabulary + and were not discarded by negative sampling). """ result = 0 for sentence in sentences: @@ -197,11 +242,27 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss def score_sentence_sg(model, sentence, work=None): """ - Obtain likelihood score for a single sentence in a fitted skip-gram representaion. - The sentence is a list of Vocab objects (or None, when the corresponding - word is not in the vocabulary). Called internally from `Word2Vec.score()`. + Obtain likelihood score for a single sentence in a fitted skip-gram representation. + + Notes + ----- This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from word2vec_inner instead. + will use the optimized version found in :mod:`~gensim.models.word2vec_inner` instead. + + Parameters + ---------- + model : :class:`~gensim.models.word2vec.Word2Vec` + The trained model. It **MUST** have been trained using hierarchical softmax and the skip-gram algorithm. + sentence : list of str + The words comprising the sentence to be scored. + work : object, optional + Unused. + + Returns + ------- + float + The probability assigned to this sentence by the Skip-Gram model. + """ log_prob_sentence = 0.0 if model.negative: @@ -223,11 +284,29 @@ def score_sentence_sg(model, sentence, work=None): def score_sentence_cbow(model, sentence, work=None, neu1=None): """ - Obtain likelihood score for a single sentence in a fitted CBOW representaion. - The sentence is a list of Vocab objects (or None, where the corresponding - word is not in the vocabulary. Called internally from `Word2Vec.score()`. + Obtain likelihood score for a single sentence in a fitted CBOW representation. + + Notes + ----- This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from word2vec_inner instead. 
+    will use the optimized version found in :mod:`~gensim.models.word2vec_inner` instead.
+
+    Parameters
+    ----------
+    model : :class:`~gensim.models.word2vec.Word2Vec`
+        The trained model. It **MUST** have been trained using hierarchical softmax and the CBOW algorithm.
+    sentence : list of str
+        The words comprising the sentence to be scored.
+    work : object, optional
+        Unused.
+    neu1 : object, optional
+        Unused.
+
+    Returns
+    -------
+    float
+        The probability assigned to this sentence by the CBOW model.
+
     """
     log_prob_sentence = 0.0
     if model.negative:
@@ -251,6 +330,38 @@ def score_sentence_cbow(model, sentence, work=None, neu1=None):


 def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_hidden=True,
                   context_vectors=None, context_locks=None, compute_loss=False, is_ft=False):
+    """Train the passed model instance on a word and its context, using the Skip-gram algorithm.
+
+    Parameters
+    ----------
+    model : :class:`~gensim.models.word2vec.Word2Vec`
+        The model to be trained.
+    word : str
+        The label (predicted) word.
+    context_index : list of int
+        The vocabulary indices of the words in the context.
+    alpha : float
+        Learning rate.
+    learn_vectors : bool, optional
+        Whether the vectors should be updated.
+    learn_hidden : bool, optional
+        Whether the weights of the hidden layer should be updated.
+    context_vectors : list of list of float, optional
+        Vector representations of the words in the context. If None, these will be retrieved from the model.
+    context_locks : list of float, optional
+        The lock factors for each word in the context.
+    compute_loss : bool, optional
+        Whether or not the training loss should be computed.
+    is_ft : bool, optional
+        If True, weights will be computed using `model.wv.syn0_vocab` and `model.wv.syn0_ngrams`
+        instead of `model.wv.syn0`.
+
+    Returns
+    -------
+    list of float
+        Error vector to be back-propagated.
+
+    """
     if context_vectors is None:
         if is_ft:
             context_vectors_vocab = model.wv.syn0_vocab
@@ -327,6 +438,40 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h


 def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True,
                     compute_loss=False, context_vectors=None, context_locks=None, is_ft=False):
+    """Train the passed model instance on a word and its context, using the CBOW algorithm.
+
+    Parameters
+    ----------
+    model : :class:`~gensim.models.word2vec.Word2Vec`
+        The model to be trained.
+    word : str
+        The label (predicted) word.
+    input_word_indices : list of int
+        The vocabulary indices of the words in the context.
+    l1 : list of float
+        Vector representation of the context, i.e. the sum or mean of the context word vectors
+        (depending on the model's `cbow_mean` setting).
+    alpha : float
+        Learning rate.
+    learn_vectors : bool, optional
+        Whether the vectors should be updated.
+    learn_hidden : bool, optional
+        Whether the weights of the hidden layer should be updated.
+    compute_loss : bool, optional
+        Whether or not the training loss should be computed.
+    context_vectors : list of list of float, optional
+        Vector representations of the words in the context. If None, these will be retrieved from the model.
+    context_locks : list of float, optional
+        The lock factors for each word in the context.
+    is_ft : bool, optional
+        If True, weights will be computed using `model.wv.syn0_vocab` and `model.wv.syn0_ngrams`
+        instead of `model.wv.syn0`.
+
+    Returns
+    -------
+    list of float
+        Error vector to be back-propagated.
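+
+    Notes
+    -----
+    A sketch of the update performed in the code below: the returned error vector has the same shape
+    as `l1`, and when `learn_vectors` is True it is added back to the vector of every context word,
+    scaled by that word's lock factor.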
+
+    """
     if context_vectors is None:
         if is_ft:
             context_vectors_vocab = model.wv.syn0_vocab
@@ -395,6 +540,22 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr


 def score_sg_pair(model, word, word2):
+    """Score the trained Skip-gram model on a pair of words.
+
+    Parameters
+    ----------
+    model : :class:`~gensim.models.word2vec.Word2Vec`
+        The trained model.
+    word : :class:`~gensim.models.keyedvectors.Vocab`
+        Vocabulary representation of the first word.
+    word2 : :class:`~gensim.models.keyedvectors.Vocab`
+        Vocabulary representation of the second word.
+
+    Returns
+    -------
+    float
+        Logarithm of the sum of exponentiations of input words.
+    """
     l1 = model.wv.syn0[word2.index]
     l2a = deepcopy(model.syn1[word.point])  # 2d matrix, codelen x layer1_size
     sgn = (-1.0) ** word.code  # ch function, 0-> 1, 1 -> -1
@@ -403,6 +564,22 @@ def score_sg_pair(model, word, word2):


 def score_cbow_pair(model, word, l1):
+    """Score the trained CBOW model on a pair of words.
+
+    Parameters
+    ----------
+    model : :class:`~gensim.models.word2vec.Word2Vec`
+        The trained model.
+    word : :class:`~gensim.models.keyedvectors.Vocab`
+        Vocabulary representation of the first word.
+    l1 : list of float
+        Vector representation of the second word.
+
+    Returns
+    -------
+    float
+        Logarithm of the sum of exponentiations of input words.
+    """
     l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
     sgn = (-1.0) ** word.code  # ch function, 0-> 1, 1 -> -1
     lprob = -logaddexp(0, -sgn * dot(l1, l2a.T))
@@ -527,9 +704,22 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
                          fast_version=FAST_VERSION)

     def _do_train_job(self, sentences, alpha, inits):
-        """
-        Train a single batch of sentences. Return 2-tuple `(effective word count after
-        ignoring unknown words and sentence length trimming, total word count)`.
+        """Train the model on a single batch of sentences.
+
+        Parameters
+        ----------
+        sentences : iterable of iterable of str
+            Corpus chunk to be used in this training batch.
+        alpha : float
+            The learning rate used in this batch.
+        inits : (np.ndarray, np.ndarray)
+            Each worker thread's private work memory.
+
+        Returns
+        -------
+        (int, int)
+            2-tuple (effective word count after ignoring unknown words and sentence length trimming, total word count).
+
+        """
         work, neu1 = inits
         tally = 0
@@ -540,10 +730,23 @@ def _do_train_job(self, sentences, alpha, inits):
         return tally, self._raw_word_count(sentences)

     def _clear_post_train(self):
-        """Resets certain properties of the model, post training."""
+        """Removes all L2-normalized vectors for words from the model.
+
+        Notes
+        -----
+        You will have to recompute them using :meth:`~gensim.models.word2vec.Word2Vec.init_sims`.
+        """
         self.wv.vectors_norm = None

     def _set_train_params(self, **kwargs):
+        """If `compute_loss` is passed, then it overrides the value set in the constructor.
+
+        Parameters
+        ----------
+        **kwargs
+            Keyword model parameters, which may include the `compute_loss` boolean.
+
+        """
         if 'compute_loss' in kwargs:
             self.compute_loss = kwargs['compute_loss']
         self.running_training_loss = 0
@@ -572,24 +775,24 @@ def train(self, sentences, total_examples=None, total_words=None,

         Parameters
         ----------
         sentences : iterable of iterable of str
             Can be simply a list of lists of tokens, but for larger corpora,
             consider an iterable that streams the sentences directly from disk/network.
             See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
             or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
- total_examples : int + total_examples : int, optional Count of sentences. - total_words : int + total_words : int, optional Count of raw words in sentences. - epochs : int + epochs : int, optional Number of iterations (epochs) over the corpus. - start_alpha : float + start_alpha : float, optional Initial learning rate. - end_alpha : float + end_alpha : float, optional Final learning rate. Drops linearly from `start_alpha`. - word_count : int + word_count : int, optional Count of words already trained. Set this to 0 for the usual case of training on all words in sentences. - queue_factor : int + queue_factor : int, optional Multiplier for size of queue (number of workers * queue_factor). - report_delay : float + report_delay : float, optional Seconds to wait before reporting progress. - compute_loss: bool + compute_loss: bool, optional If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. @@ -616,7 +819,7 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor This does not change the fitted model in any way (see Word2Vec.train() for that). We have currently only implemented score for the hierarchical softmax scheme, - so you need to have run word2vec with hs=1 and negative=0 for this to work. + so you need to have run word2vec with `hs = 1` and `negative = 0` for this to work. Note that you should specify total_sentences; we'll run into problems if you ask to score more than this number of sentences but it is inefficient to set the value too high. @@ -635,13 +838,13 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - total_sentences : int + total_sentences : int, optional Count of sentences. - chunksize : int + chunksize : int, optional Chunksize of jobs - queue_factor : int + queue_factor : int, optional Multiplier for size of queue (number of workers * queue_factor). - report_delay : float + report_delay : float, optional Seconds to wait before reporting progress. """ @@ -750,29 +953,36 @@ def worker_loop(): def clear_sims(self): """Removes all L2-normalized vectors for words from the model. - You will have to recompute them using init_sims method. - """ + Notes + ----- + You will have to recompute them using :meth:`~gensim.models.word2vec.Word2Vec.init_sims`. + """ self.wv.vectors_norm = None def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8', unicode_errors='strict'): - """Merge the input-hidden weight matrix from the original C word2vec-tool format - given, where it intersects with the current vocabulary. (No words are added to the - existing vocabulary, but intersecting words adopt the file's weights, and - non-intersecting words are left alone.) + """Merge the input-hidden weight matrix from the original C word2vec-tool format given, + where it intersects with the current vocabulary. + + Notes + ----- + No words are added to the existing vocabulary, but intersecting words adopt the file's weights, and + non-intersecting words are left alone. 

         Parameters
         ----------
         fname : str
             The file path used to save the vectors in
-
-        binary : bool
-            If True, the data wil be saved in binary word2vec format, else it will be saved in plain text.
-
-        lockf : float
+        lockf : float, optional
             Lock-factor value to be set for any imported word-vectors; the
             default value of 0.0 prevents further updating of the vector during subsequent
             training. Use 1.0 to allow further training updates of merged vectors.
+        binary : bool, optional
+            If True, `fname` is in the binary word2vec format, else it is in plain text.
+        encoding : str, optional
+            Encoding of `text` for `unicode` function (python2 only).
+        unicode_errors : str, optional
+            Error handling behaviour, used as parameter for `unicode` function (python2 only).

         """
         overlap_count = 0
@@ -834,14 +1044,14 @@ def predict_output_word(self, context_words_list, topn=10):

         Parameters
         ----------
-        context_words_list : :obj: `list` of :obj: `str`
-            List of context words
-        topn: int
+        context_words_list : list of str
+            List of context words.
+        topn : int, optional
             Return `topn` words and their probabilities

         Returns
         -------
-        :obj: `list` of :obj: `tuple`
+        list of tuple of (str, float)
             `topn` length list of tuples of (word, probability)

         """
         word_vocabs = [self.wv.vocab[w] for w in context_words_list if w in self.wv.vocab]
@@ -873,18 +1083,40 @@ def predict_output_word(self, context_words_list, topn=10):
         return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices]

     def init_sims(self, replace=False):
-        """
-        init_sims() resides in KeyedVectors because it deals with syn0/vectors mainly, but because syn1 is not an
+        """Precompute L2-normalized vectors.
+
+        Parameters
+        ----------
+        replace : bool, optional
+            If True, forget the original vectors and only keep the normalized ones. This will save a lot of RAM.
+
+        Notes
+        -----
+        `init_sims` resides in KeyedVectors because it deals with syn0/vectors mainly, but because syn1 is not an
         attribute of KeyedVectors, it has to be deleted in this class, and the normalizing of syn0/vectors happens
-        inside of KeyedVectors
+        inside of KeyedVectors.
+
         """
         if replace and hasattr(self.trainables, 'syn1'):
             del self.trainables.syn1
-        return self.wv.init_sims(replace)
+        self.wv.init_sims(replace)

     def reset_from(self, other_model):
-        """Borrow shareable pre-built structures (like vocab) from the other_model. Useful
-        if testing multiple models in parallel on the same corpus.
+        """Borrow shareable pre-built structures from the other_model and reset hidden layer weights.
+
+        Structures copied are:
+            * Vocabulary
+            * Index to word mapping
+            * Cum table (used for negative sampling)
+            * Cached corpus length
+
+        Useful if testing multiple models in parallel on the same corpus.
+
+        Parameters
+        ----------
+        other_model : :class:`~gensim.models.word2vec.Word2Vec`
+            Another model from where internal structures will be copied.
+
+        """
         self.wv.vocab = other_model.wv.vocab
         self.wv.index2word = other_model.wv.index2word
         self.vocabulary.cum_table = other_model.vocabulary.cum_table
         self.corpus_count = other_model.corpus_count
         self.trainables.reset_weights(self.hs, self.negative, self.wv)

     @staticmethod
     def log_accuracy(section):
-        return Word2VecKeyedVectors.log_accuracy(section)
+        """Logs the model's accuracy scored on a single section of a corpus.
+
+        Static wrapper for :meth:`~gensim.models.word2vec.Word2VecKeyedVectors.log_accuracy`.
+
+        Parameters
+        ----------
+        section : iterable of iterable of str
+            Chunk of sentences used to score the model's accuracy.
+
+        """
+        Word2VecKeyedVectors.log_accuracy(section)

     @deprecated("Method will be removed in 4.0.0, use self.wv.accuracy() instead")
     def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_insensitive=True):
@@ -902,14 +1144,27 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_inse
         return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive)

     def __str__(self):
+        """Human-readable representation of the model's state.
+
+        Returns
+        -------
+        str
+            Human-readable representation of the model's state (vocabulary size, vector size and learning rate).
+
+        """
         return "%s(vocab=%s, size=%s, alpha=%s)" % (
             self.__class__.__name__, len(self.wv.index2word), self.wv.vector_size, self.alpha
         )

     def delete_temporary_training_data(self, replace_word_vectors_with_normalized=False):
-        """Discard parameters that are used in training and score. Use if you're sure you're done training a model.
-        If `replace_word_vectors_with_normalized` is set, forget the original vectors and only keep the normalized
-        ones = saves lots of memory!
+        """Discard parameters that are used in training and scoring.
+
+        Use if you're sure you're done training a model.
+
+        Parameters
+        ----------
+        replace_word_vectors_with_normalized : bool, optional
+            If True, forget the original vectors and only keep the normalized ones to save RAM.
         """
         if replace_word_vectors_with_normalized:
             self.init_sims(replace=True)
@@ -930,6 +1185,14 @@ def save(self, *args, **kwargs):
         super(Word2Vec, self).save(*args, **kwargs)

     def get_latest_training_loss(self):
+        """Getter for the current value of the training loss.
+
+        Returns
+        -------
+        float
+            Current training loss.
+
+        """
         return self.running_training_loss

     @deprecated(
@@ -959,7 +1222,7 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False):

     @classmethod
     def load(cls, *args, **kwargs):
-        """Loads a previously saved `Word2Vec` model. Also see `save()`.
+        """Loads a previously saved `Word2Vec` model. Also see :meth:`~gensim.models.word2vec.Word2Vec.save`.

         Parameters
         ----------
         fname : str
             Path to the saved file.

         Returns
         -------
-        :obj: `~gensim.models.word2vec.Word2Vec`
-            Returns the loaded model as an instance of :class: `~gensim.models.word2vec.Word2Vec`.
+        :class:`~gensim.models.word2vec.Word2Vec`
+            Loaded model as an instance of :class:`~gensim.models.word2vec.Word2Vec`.
         """
         try:
             return super(Word2Vec, cls).load(*args, **kwargs)

From e0fe6652b8f4baa6b075c76947edd640b0d906a7 Mon Sep 17 00:00:00 2001
From: "Stergiadis, E"
Date: Mon, 5 Mar 2018 16:32:52 +0100
Subject: [PATCH 08/41] remove duplicated type info

---
 gensim/models/word2vec.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 97b90dae97..49c41257dc 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1222,7 +1222,8 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False):

     @classmethod
     def load(cls, *args, **kwargs):
-        """Loads a previously saved `Word2Vec` model. Also see :meth:`~gensim.models.word2vec.Word2Vec.save`.
+        """Loads a previously saved :class:`~gensim.models.word2vec.Word2Vec` model.
+        Also see :meth:`~gensim.models.word2vec.Word2Vec.save`.

         Parameters
         ----------
@@ -1232,7 +1233,7 @@
         Returns
         -------
         :class:`~gensim.models.word2vec.Word2Vec`
-            Loaded model as an instance of :class:`~gensim.models.word2vec.Word2Vec`.
+            Loaded model.
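+
+        Examples
+        --------
+        A minimal round-trip sketch (the file name is illustrative):
+
+        >>> # assuming `model` is an existing, trained Word2Vec instance
+        >>> model.save("word2vec.model")
+        >>> model = Word2Vec.load("word2vec.model")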
""" try: return super(Word2Vec, cls).load(*args, **kwargs) From 8aa85bc5831d6c40250630da02f2855e2fbc005a Mon Sep 17 00:00:00 2001 From: "Stergiadis, E" Date: Mon, 5 Mar 2018 16:33:22 +0100 Subject: [PATCH 09/41] Added documentation for `Doc2vec` model and all its helper methods --- gensim/models/doc2vec.py | 470 ++++++++++++++++++++++++++++----------- 1 file changed, 337 insertions(+), 133 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index f57694273d..e5a2902ce0 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -85,20 +85,52 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): - """ - Update distributed bag of words model ("PV-DBOW") by training on a single document. - Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. - The document is provided as `doc_words`, a list of word tokens which are looked up - in the model's vocab dictionary, and `doctag_indexes`, which provide indexes - into the doctag_vectors array. - If `train_words` is True, simultaneously train word-to-word (not just doc-to-word) - examples, exactly as per Word2Vec skip-gram training. (Without this option, - word vectors are neither consulted nor updated during DBOW doc vector training.) - Any of `learn_doctags', `learn_words`, and `learn_hidden` may be set False to - prevent learning-updates to those respective model weights, as if using the - (partially-)frozen model to infer other compatible vectors. + """Update distributed bag of words model ("PV-DBOW") by training on a single document. + + Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and + :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector()`. + + Notes + ----- This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from doc2vec_inner instead. + will use the optimized version from :mod:`~gensim.models.doc2vec_inner` instead. + + Parameters + ---------- + model : :class:`~gensim.models.doc2vec.Doc2Vec` + The model to train. + doc_words : list of str + The input document as a list of words to be used for training. Each word will be looked up in + the model's vocabulary. + doctag_indexes : list of int + Indices into `doctag_vectors` used to obtain the tags of the document. + alpha : float + Learning rate. + work : + train_words : bool, optional + Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both** + `learn_words` and `train_words` are set to True. + learn_doctags : bool, optional + Whether the tag vectors should be updated. + learn_words : bool, optional + Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both** + `learn_words` and `train_words` are set to True. + learn_hidden : bool, optional + Whether or not the weights of the hidden layer will be updated. + word_vectors : object, optional + Unused. + word_locks : object, optional + Unused. + doctag_vectors : list of list of float, optional + Vector representations of the tags. If None, these will be retrieved from the model. + doctag_locks : list of float, optional + The lock factors for each tag. + + Returns + ------- + int + Number of words in the input document. 
+
    """
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0
@@ -119,21 +151,56 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None,
 def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                       learn_doctags=True, learn_words=True, learn_hidden=True,
                       word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
-    """
-    Update distributed memory model ("PV-DM") by training on a single document.
-    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This
-    method implements the DM model with a projection (input) layer that is
-    either the sum or mean of the context vectors, depending on the model's
-    `dm_mean` configuration field. See `train_document_dm_concat()` for the DM
-    model with a concatenated input layer.
-    The document is provided as `doc_words`, a list of word tokens which are looked up
-    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
-    into the doctag_vectors array.
-    Any of `learn_doctags', `learn_words`, and `learn_hidden` may be set False to
-    prevent learning-updates to those respective model weights, as if using the
-    (partially-)frozen model to infer other compatible vectors.
-    This is the non-optimized, Python version. If you have a C compiler, gensim
-    will use the optimized version from doc2vec_inner instead.
+    """Update distributed memory model ("PV-DM") by training on a single document.
+
+    Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and
+    :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector`. This method implements
+    the DM model with a projection (input) layer that is either the sum or mean of
+    the context vectors, depending on the model's `dm_mean` configuration field.
+
+    Notes
+    -----
+    This is the non-optimized, Python version. If you have cython installed, gensim
+    will use the optimized version from :mod:`~gensim.models.doc2vec_inner` instead.
+
+    Parameters
+    ----------
+    model : :class:`~gensim.models.doc2vec.Doc2Vec`
+        The model to train.
+    doc_words : list of str
+        The input document as a list of words to be used for training. Each word will be looked up in
+        the model's vocabulary.
+    doctag_indexes : list of int
+        Indices into `doctag_vectors` used to obtain the tags of the document.
+    alpha : float
+        Learning rate.
+    work : object
+        Unused.
+    neu1 : object
+        Unused.
+    learn_doctags : bool, optional
+        Whether the tag vectors should be updated.
+    learn_words : bool, optional
+        Whether the word vectors should be updated.
+    learn_hidden : bool, optional
+        Whether or not the weights of the hidden layer will be updated.
+    word_vectors : iterable of iterable of float, optional
+        Vector representations of each word in the model's vocabulary.
+    word_locks : list of float, optional
+        Lock factors for each word in the vocabulary.
+    doctag_vectors : list of list of float, optional
+        Vector representations of the tags. If None, these will be retrieved from the model.
+    doctag_locks : list of float, optional
+        The lock factors for each tag.
+
+    Returns
+    -------
+    int
+        Number of words in the input document that were actually used for training (they were found in the
+        vocabulary and they were not discarded by negative sampling).
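+
+    Examples
+    --------
+    A sketch of the "frozen model" usage enabled by the `learn_*` flags: update only the tag
+    vector, assuming `model` is an already trained :class:`~gensim.models.doc2vec.Doc2Vec`
+    whose vocabulary contains the (illustrative) words below::
+
+        >>> train_document_dm(model, ['cat', 'sat'], [0], alpha=0.025,
+        ...                   learn_words=False, learn_hidden=False)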
+
    """
    if word_vectors is None:
        word_vectors = model.wv.syn0
@@ -172,18 +239,54 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N
 def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                              learn_doctags=True, learn_words=True, learn_hidden=True,
                              word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
-    """
-    Update distributed memory model ("PV-DM") by training on a single document, using a
-    concatenation of the context window word vectors (rather than a sum or average).
+    """Update distributed memory model ("PV-DM") by training on a single document, using a
+    concatenation of the context window word vectors (rather than a sum or average). This
+    might be slower since the input at each batch will be significantly larger.
+
     Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.
-    The document is provided as `doc_words`, a list of word tokens which are looked up
-    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
-    into the doctag_vectors array.
-    Any of `learn_doctags', `learn_words`, and `learn_hidden` may be set False to
-    prevent learning-updates to those respective model weights, as if using the
-    (partially-)frozen model to infer other compatible vectors.
-    This is the non-optimized, Python version. If you have a C compiler, gensim
-    will use the optimized version from doc2vec_inner instead.
+
+    Notes
+    -----
+    This is the non-optimized, Python version. If you have cython installed, gensim
+    will use the optimized version from :mod:`~gensim.models.doc2vec_inner` instead.
+
+    Parameters
+    ----------
+    model : :class:`~gensim.models.doc2vec.Doc2Vec`
+        The model to train.
+    doc_words : list of str
+        The input document as a list of words to be used for training. Each word will be looked up in
+        the model's vocabulary.
+    doctag_indexes : list of int
+        Indices into `doctag_vectors` used to obtain the tags of the document.
+    alpha : float
+        Learning rate.
+    work : object
+        Unused.
+    neu1 : object
+        Unused.
+    learn_doctags : bool, optional
+        Whether the tag vectors should be updated.
+    learn_words : bool, optional
+        Whether the word vectors should be updated.
+    learn_hidden : bool, optional
+        Whether or not the weights of the hidden layer will be updated.
+    word_vectors : iterable of iterable of float, optional
+        Vector representations of each word in the model's vocabulary.
+    word_locks : list of float, optional
+        Lock factors for each word in the vocabulary.
+    doctag_vectors : list of list of float, optional
+        Vector representations of the tags. If None, these will be retrieved from the model.
+    doctag_locks : list of float, optional
+        The lock factors for each tag.
+
+    Returns
+    -------
+    int
+        Number of words in the input document that were actually used for training (they were found in the
+        vocabulary and they were not discarded by negative sampling).
+
    """
    if word_vectors is None:
        word_vectors = model.wv.syn0
@@ -234,7 +337,7 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,

 class TaggedDocument(namedtuple('TaggedDocument', 'words tags')):
-    """
+    """Represents a document along with a tag.
     A single document, made up of `words` (a list of unicode string tokens) and `tags` (a list of tokens).
Tags may be one or more unicode string tokens, but typical practice (which will also be most memory-efficient) is
@@ -281,65 +384,62 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0

         Parameters
         ----------
-        documents : iterable of iterables
-            The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora,
-            consider an iterable that streams the documents directly from disk/network.
-            If you don't supply `documents`, the model is left uninitialized -- use if
-            you plan to initialize it in some other way.
-
-        dm : int {1,0}
+        documents : iterable of :class:`~gensim.models.doc2vec.TaggedDocument`, optional
+            Can be simply a list of elements, but for larger corpora, consider an iterable that streams
+            the documents directly from disk/network. If you don't supply `documents`, the model is
+            left uninitialized -- use if you plan to initialize it in some other way.
+        dm : int {1,0}, optional
             Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used.
             Otherwise, `distributed bag of words` (PV-DBOW) is employed.
-
-        size : int
+        size : int, optional
            Dimensionality of the feature vectors.
-        window : int
+        window : int, optional
            The maximum distance between the current and predicted word within a sentence.
-        alpha : float
+        alpha : float, optional
            The initial learning rate.
-        min_alpha : float
+        min_alpha : float, optional
            Learning rate will linearly drop to `min_alpha` as training progresses.
-        seed : int
+        seed : int, optional
            Seed for the random number generator. Initial vectors for each word are seeded with a hash of
            the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
            you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
            from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
            use of the `PYTHONHASHSEED` environment variable to control hash randomization).
-        min_count : int
+        min_count : int, optional
            Ignores all words with total frequency lower than this.
-        max_vocab_size : int
+        max_vocab_size : int, optional
            Limits the RAM during vocabulary building; if there are more unique
            words than this, then prune the infrequent ones. Every 10 million word types
            need about 1GB of RAM. Set to `None` for no limit.
-        sample : float
+        sample : float, optional
            The threshold for configuring which higher-frequency words are randomly downsampled,
            useful range is (0, 1e-5).
-        workers : int
+        workers : int, optional
            Use these many worker threads to train the model (=faster training with multicore machines).
-        iter : int
+        iter : int, optional
            Number of iterations (epochs) over the corpus.
-        hs : int {1,0}
+        hs : int {1,0}, optional
            If 1, hierarchical softmax will be used for model training.
            If set to 0, and `negative` is non-zero, negative sampling will be used.
-        negative : int
+        negative : int, optional
            If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
            should be drawn (usually between 5-20).
            If set to 0, no negative sampling is used.
-        dm_mean : int {1,0}
+        dm_mean : int {1,0}, optional
            If 0, use the sum of the context word vectors. If 1, use the mean.
            Only applies when `dm` is used in non-concatenative mode.
-        dm_concat : int {1,0}
+        dm_concat : int {1,0}, optional
             If 1, use concatenation of context vectors rather than sum/average;
             Note concatenation results in a much-larger model, as the input
             is no longer the size of one (sampled or arithmetically combined) word vector, but the
             size of the tag(s) and all words in the context strung together.
-        dm_tag_count : int
+        dm_tag_count : int, optional
             Expected constant number of document tags per document, when using
-            dm_concat mode; default is 1.
-        dbow_words : int {1,0}
+            dm_concat mode.
+        dbow_words : int {1,0}, optional
             If set to 1 trains word-vectors (in skip-gram fashion) simultaneous with DBOW
             doc-vector training; If 0, only trains doc-vectors (faster).
-        trim_rule : function
+        trim_rule : function, optional
             Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
             be trimmed away, or handled using the default (discard if word count < min_count).
             Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
             or a callable that accepts parameters (word, count, min_count) and returns either
             :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
             Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
             of the model.
-        callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
+        callbacks : :obj:`list` of :obj:`~gensim.models.callbacks.CallbackAny2Vec`, optional
             List of callbacks that need to be executed/run at specific stages during training.

         """
@@ -407,14 +507,20 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0

     @property
     def dm(self):
-        """int {1,0} : `dm=1` indicates 'distributed memory' (PV-DM) else
-        `distributed bag of words` (PV-DBOW) is used."""
+        """Indicates whether 'distributed memory' (PV-DM) will be used, else `distributed bag of words`
+        (PV-DBOW) is used.
+
+        Either this or :meth:`~gensim.models.doc2vec.Doc2Vec.dbow` will return True.
+        """
         return not self.sg  # opposite of SG

     @property
     def dbow(self):
-        """int {1,0} : `dbow=1` indicates `distributed bag of words` (PV-DBOW) else
-        'distributed memory' (PV-DM) is used."""
+        """Indicates whether `distributed bag of words` (PV-DBOW) will be used, else 'distributed memory'
+        (PV-DM) is used.
+
+        Either this or :meth:`~gensim.models.doc2vec.Doc2Vec.dm` will return True.
+        """
         return self.sg  # same as SG

     def _set_train_params(self, **kwargs):

     def _clear_post_train(self):
         self.clear_sims()

     def clear_sims(self):
+        """Reset the cached normalized vectors (they will be recomputed when next needed)."""
         self.wv.vectors_norm = None
         self.wv.vectors_docs_norm = None

     def reset_from(self, other_model):
-        """Reuse shareable structures from other_model."""
+        """Copy shareable data structures from another (possibly pretrained) model.
+
+        Parameters
+        ----------
+        other_model : :class:`~gensim.models.doc2vec.Doc2Vec`
+            Another model whose internal data structures will be copied over to the current object.
+
+        """
         self.wv.vocab = other_model.wv.vocab
         self.wv.index2word = other_model.wv.index2word
         self.vocabulary.cum_table = other_model.vocabulary.cum_table
@@ -439,6 +553,23 @@ def reset_from(self, other_model):
         self.trainables.reset_weights(self.hs, self.negative, self.wv, self.docvecs)

     def _do_train_job(self, job, alpha, inits):
+        """Train the model on a single batch of documents.
+
+        Parameters
+        ----------
+        job : iterable of :class:`~gensim.models.doc2vec.TaggedDocument`
+            The corpus chunk to be used for training this batch.
+        alpha : float
+            Learning rate to be used for training this batch.
+        inits : (np.ndarray, np.ndarray)
+            Each worker thread's private work memory.
+
+        Returns
+        -------
+        (int, int)
+            2-tuple (effective word count after ignoring unknown words and sentence length trimming, total word count).
+
+        """
         work, neu1 = inits
         tally = 0
         for doc in job:
@@ -481,29 +612,28 @@ def train(self, documents, total_examples=None, total_words=None,

         Parameters
         ----------
-        documents : iterable of iterables
-            The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora,
-            consider an iterable that streams the documents directly from disk/network.
-            See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument`
-            in :mod:`~gensim.models.doc2vec` module for such examples.
-        total_examples : int
+        documents : iterable of :class:`~gensim.models.doc2vec.TaggedDocument`
+            Can be simply a list of elements, but for larger corpora, consider an iterable that streams
+            the documents directly from disk/network. See :class:`~gensim.models.doc2vec.TaggedBrownCorpus`
+            or :class:`~gensim.models.doc2vec.TaggedLineDocument` for such examples.
+        total_examples : int, optional
             Count of sentences.
-        total_words : int
+        total_words : int, optional
             Count of raw words in documents.
-        epochs : int
+        epochs : int, optional
             Number of iterations (epochs) over the corpus.
-        start_alpha : float
+        start_alpha : float, optional
             Initial learning rate.
-        end_alpha : float
+        end_alpha : float, optional
             Final learning rate. Drops linearly from `start_alpha`.
-        word_count : int
+        word_count : int, optional
             Count of words already trained. Set this to 0 for the usual
             case of training on all words in sentences.
-        queue_factor : int
+        queue_factor : int, optional
             Multiplier for size of queue (number of workers * queue_factor).
-        report_delay : float
+        report_delay : float, optional
             Seconds to wait before reporting progress.
-        callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
+        callbacks : :obj:`list` of :obj:`~gensim.models.callbacks.CallbackAny2Vec`, optional
             List of callbacks that need to be executed/run at specific stages during training.
         """
         super(Doc2Vec, self).train(
             documents, total_examples=total_examples, total_words=total_words,
             epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
             queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks)

     def _raw_word_count(self, job):
-        """Return the number of words in a given job."""
+        """Return the number of words in a given job.
+
+        Parameters
+        ----------
+        job : iterable of :class:`~gensim.models.doc2vec.TaggedDocument`
+            Corpus chunk.
+
+        Returns
+        -------
+        int
+            Number of raw words in the corpus chunk.
+
+        """
         return sum(len(sentence.words) for sentence in job)

     def estimated_lookup_memory(self):
-        """Estimated memory for tag lookup; 0 if using pure int tags."""
+        """Estimated memory for tag lookup; 0 if using pure int tags.
+
+        Returns
+        -------
+        int
+            The estimated RAM required to look up a tag in bytes.
+        """
         return 60 * len(self.docvecs.offset2doctag) + 140 * len(self.docvecs.doctags)

     def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
-        """
-        Infer a vector for given post-bulk training document.
+        """Infer a vector for given post-bulk training document.

         Parameters
         ----------
-        doc_words : :obj: `list` of :obj: `str`
-            Document should be a list of (word) tokens.
+        doc_words : list of str
+            A (potentially unseen) document.
+        alpha : float, optional
             The initial learning rate.
-        min_alpha : float
+        min_alpha : float, optional
             Learning rate will linearly drop to `min_alpha` as training progresses.
-        steps : int
+        steps : int, optional
             Number of times to train the new document.

         Returns
         -------
-        :obj: `numpy.ndarray`
-            Returns the inferred vector for the new document.
+        np.ndarray
+            The inferred paragraph vector for the new document.

         """
         doctag_vectors, doctag_locks = self.trainables.get_doctag_trainables(doc_words, self.docvecs.vector_size)
@@ -567,6 +714,19 @@ def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
         return doctag_vectors[0]

     def __getitem__(self, tag):
+        """Get the vector representation of a (possibly multi-term) tag.
+
+        Parameters
+        ----------
+        tag : {str, int, list of str, list of int}
+            The tag (or tags) to be looked up in the model.
+
+        Returns
+        -------
+        np.ndarray
+            The vector representations of each tag as a matrix (will be 1D if `tag` was a single tag).
+
+        """
         if isinstance(tag, string_types + integer_types + (integer,)):
             if tag not in self.wv.vocab:
                 return self.docvecs[tag]
@@ -574,7 +734,13 @@ def __getitem__(self, tag):
             return vstack([self[i] for i in tag])

     def __str__(self):
-        """Abbreviated name reflecting major configuration paramaters."""
+        """Abbreviated name reflecting major configuration parameters.
+
+        Returns
+        -------
+        str
+            Human-readable representation of the model's internal state.
+        """
         segments = []
         if self.comment:
             segments.append('"%s"' % self.comment)
@@ -612,11 +778,13 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen

         Parameters
         ----------
-        keep_doctags_vectors : bool
-            Set `keep_doctags_vectors` to False if you don't want to save doctags vectors,
-            in this case you can't to use docvecs's most_similar, similarity etc. methods.
-        keep_inference : bool
-            Set `keep_inference` to False if you don't want to store parameters that is used for infer_vector method
+        keep_doctags_vectors : bool, optional
+            Set to False if you don't want to save doctags vectors. In this case you will not be able to
+            use :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.most_similar`,
+            :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.similarity` etc. methods.
+        keep_inference : bool, optional
+            Set to False if you don't want to store parameters that are used for
+            :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector` method.

         """
         if not keep_inference:
@@ -640,16 +808,16 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*
         ---------
         fname : str
             The file path used to save the vectors in.
-        doctag_vec : bool
+        doctag_vec : bool, optional
             Indicates whether to store document vectors.
-        word_vec : bool
+        word_vec : bool, optional
             Indicates whether to store word vectors.
-        prefix : str
+        prefix : str, optional
             Uniquely identifies doctags from word vocab, and avoids collision
             in case of repeated string in doctag and word vocab.
-        fvocab : str
+        fvocab : str, optional
             Optional file path used to save the vocabulary.
-        binary : bool
+        binary : bool, optional
             If True, the data will be saved in binary word2vec format, else it will be saved in plain text.

         """
@@ -670,21 +838,39 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*
             binary=binary, write_first_line=write_first_line)

     def init_sims(self, replace=False):
-        """
-        Precompute L2-normalized vectors.

-        If `replace` is set, forget the original vectors and only keep the normalized
-        ones = saves lots of memory!
+        """Precompute L2-normalized vectors.
+
+        Parameters
+        ----------
+        replace : bool, optional
+            If set, forget the original vectors and only keep the normalized ones to save RAM.

-        Note that you **cannot continue training or inference** after doing a replace.
-        The model becomes effectively read-only = you can call `most_similar`, `similarity`
-        etc., but not `train` or `infer_vector`.
+        Notes
+        -----
+        You **cannot continue training or inference** after doing a replace.
+        The model becomes effectively read-only - you can call
+        :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.most_similar`,
+        :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.similarity` etc., but not
+        :meth:`~gensim.models.doc2vec.Doc2Vec.train` or :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector`.

        """
-        return self.docvecs.init_sims(replace=replace)
+        self.docvecs.init_sims(replace=replace)

     @classmethod
     def load(cls, *args, **kwargs):
+        """Loads a previously saved :class:`~gensim.models.doc2vec.Doc2Vec` model.
+        Also see :meth:`~gensim.models.doc2vec.Doc2Vec.save`.
+
+        Parameters
+        ----------
+        fname : str
+            Path to the saved file.
+
+        Returns
+        -------
+        :class:`~gensim.models.doc2vec.Doc2Vec`
+            Loaded model.
+        """
         try:
             return super(Doc2Vec, cls).load(*args, **kwargs)
         except AttributeError:
@@ -693,7 +879,23 @@ def load(cls, *args, **kwargs):
             return load_old_doc2vec(*args, **kwargs)

     def estimate_memory(self, vocab_size=None, report=None):
-        """Estimate required memory for a model using current settings."""
+        """Estimate required memory for a model using current settings.
+
+        Parameters
+        ----------
+        vocab_size : int, optional
+            Number of unique words in the vocabulary.
+        report : dict of (str, int), optional
+            A dictionary from string representations of the **specific** model's memory consuming members
+            to their size in bytes.
+
+        Returns
+        -------
+        dict of (str, int)
+            A dictionary from string representations of the model's memory consuming members to their size in bytes.
+            Includes members from the base classes as well as weights and tag lookup memory estimation specific to the
+            class.
+        """
         report = report or {}
         report['doctag_lookup'] = self.estimated_lookup_memory()
         report['doctag_syn0'] = self.docvecs.count * self.vector_size * dtype(REAL).itemsize
@@ -701,15 +903,18 @@ def estimate_memory(self, vocab_size=None, report=None):

     def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs):
         """Build vocabulary from a sequence of sentences (can be a once-only generator stream).

         Parameters
         ----------
-        documents : iterable of iterables
-            The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora,
+        documents : iterable of :class:`~gensim.models.doc2vec.TaggedDocument`
+            Can be simply a list of :class:`~gensim.models.doc2vec.TaggedDocument` elements, but for larger corpora,
             consider an iterable that streams the documents directly from disk/network.
             See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument`
             in :mod:`~gensim.models.doc2vec` module for such examples.
+        update : bool
+            If true, the new words in `documents` will be added to model's vocab.
+        progress_per : int
+            Indicates how many words to process before showing/updating the progress.
         keep_raw_vocab : bool
             If not true, delete the raw vocabulary after the scaling is done and free up RAM.
trim_rule : function
             Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
             be trimmed away, or handled using the default (discard if word count < min_count).
             Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
             or a callable that accepts parameters (word, count, min_count) and returns either
             :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
             Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
             of the model.
+        **kwargs
+            Additional keyword arguments passed to the internal vocabulary construction.
         """
         total_words, corpus_count = self.vocabulary.scan_vocab(
             documents, self.docvecs, progress_per=progress_per, trim_rule=trim_rule)
@@ -737,20 +940,20 @@ def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_voca
             self.hs, self.negative, self.wv, self.docvecs, update=update)

     def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False):
-        """
-        Build vocabulary from a dictionary of word frequencies.
-        Build model vocabulary from a passed dictionary that contains (word,word count).
+        """Build vocabulary from a dictionary of word frequencies.
+
+        Build model vocabulary from a passed dictionary that contains a (word -> word count) mapping.
         Words must be of type unicode strings.

         Parameters
         ----------
-        word_freq : dict
-            Word,Word_Count dictionary.
+        word_freq : dict of (str, int)
+            Word count mapping.
-        keep_raw_vocab : bool
+        keep_raw_vocab : bool, optional
             If not true, delete the raw vocabulary after the scaling is done and free up RAM.
-        corpus_count : int
+        corpus_count : int, optional
             Even if no corpus is provided, this argument can set corpus_count explicitly.
-        trim_rule : function
+        trim_rule : function, optional
             Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
             be trimmed away, or handled using the default (discard if word count < min_count).
             Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
             or a callable that accepts parameters (word, count, min_count) and returns either
             :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
             Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
             of the model.
-        update : bool
+        update : bool, optional
             If true, the new provided words in `word_freq` dict will be added to model's vocab.

         Examples
         --------
         >>> from gensim.models.word2vec import Word2Vec
+        >>>
         >>> model = Word2Vec()
         >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20})
         """

From 7c74a4c8a9199e256a2629e2760428d15ba18161 Mon Sep 17 00:00:00 2001
From: "Stergiadis, E"
Date: Tue, 6 Mar 2018 11:20:32 +0100
Subject: [PATCH 10/41] =?UTF-8?q?Fixed=20paper=20references=20and=20added?=
 =?UTF-8?q?=20documentation=20for=20=C2=96Doc2VecVocab?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 gensim/models/doc2vec.py | 140 +++++++++++++++++++++++++++++++++------
 1 file changed, 119 insertions(+), 21 deletions(-)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index e5a2902ce0..c555288ba5 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -8,41 +8,38 @@

 """
 Deep learning via the distributed memory and distributed bag of words models from
-[1]_, using either hierarchical softmax or negative sampling [2]_ [3]_. See [#tutorial]_
+`Quoc Le and Tomas Mikolov: "Distributed Representations of Sentences and Documents"
+<http://arxiv.org/pdf/1405.4053v2.pdf>`_, using either hierarchical softmax or negative sampling, see
+`Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean: "Efficient Estimation of Word Representations in
+Vector Space, in Proceedings of Workshop at ICLR, 2013" <https://arxiv.org/pdf/1301.3781.pdf>`_ and
+`Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean: "Distributed Representations of Words
+and Phrases and their Compositionality. In Proceedings of NIPS, 2013" <https://arxiv.org/abs/1310.4546>`_.
+
+For a real world usage scenario, see the `Doc2vec in gensim tutorial
+<https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb>`_

 **Make sure you have a C compiler before installing gensim, to use optimized (compiled) doc2vec training**
-(70x speedup [blog]_).
+(70x speedup, see `Optimizing word2vec in gensim
+<http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/>`_).

+Examples
+--------
+
+#. Initialize a model with e.g.::

 >>> model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4)

-Persist a model to disk with::
+#. Persist a model to disk with::

 >>> model.save(fname)
 >>> model = Doc2Vec.load(fname)  # you can continue training with the loaded model!

 If you're finished training a model (=no more updates, only querying), you can do

- >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True):
+>>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True):

 to trim unneeded model memory = use (much) less RAM.
-
-
-.. [1] Quoc Le and Tomas Mikolov. Distributed Representations of Sentences and Documents.
-   http://arxiv.org/pdf/1405.4053v2.pdf
-.. [2] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean.
-   Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013.
-.. [3] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean.
-   Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013.
-.. [blog] Optimizing word2vec in gensim, http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/
-
-.. [#tutorial] Doc2vec in gensim tutorial,
-   https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb
-
-
-
 """
 import logging

 class TaggedDocument(namedtuple('TaggedDocument', 'words tags')):
     """Represents a document along with a tag.
+
     A single document, made up of `words` (a list of unicode string tokens) and `tags` (a list of tokens).
     Tags may be one or more unicode string tokens, but typical practice (which will also be most memory-efficient) is

     """
     def __str__(self):
+        """Human readable representation of the object's state, used for debugging.
+
+        Returns
+        -------
+        str
+            Human readable representation of the object's state.
+
+        """
         return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags)


 class Doc2VecVocab(Word2VecVocab):
+    """Vocabulary used by :class:`~gensim.models.doc2vec.Doc2Vec`.
+
+    This includes a mapping from words found in the corpus to their total occurrence count.
+
+    """
     def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0):
+        """Initialize the vocabulary.
+
+        Parameters
+        ----------
+        max_vocab_size : int, optional
+            Maximum number of words in the vocabulary. Used to limit the RAM during vocabulary building;
+            if there are more unique words than this, then prune the infrequent ones.
+            Every 10 million word types need about 1GB of RAM. Set to `None` for no limit.
+        min_count : int, optional
+            Words with frequency lower than this limit will be discarded from the vocabulary.
+        sample : float, optional
+            The threshold for configuring which higher-frequency words are randomly downsampled,
+            useful range is (0, 1e-5).
+        sorted_vocab : bool, optional
+            If True, sort the vocabulary by descending frequency before assigning word indexes.
+        null_word : int {0, 1}
+            If 1, a null pseudo-word will be created for padding when using concatenative L1 (run-of-words).
+            This word is only ever input, never predicted, so count, huffman-point, etc. don't matter.
+
+        """
         super(Doc2VecVocab, self).__init__(
             max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
             sorted_vocab=sorted_vocab, null_word=null_word)

     def scan_vocab(self, documents, docvecs, progress_per=10000, trim_rule=None):
+        """Create the model's vocabulary: a mapping from unique words in the corpus to their occurrence count.
+
+        Parameters
+        ----------
+        documents : iterable of :class:`~gensim.models.doc2vec.TaggedDocument`
+            The tagged documents used to create the vocabulary. Their tags can be either str tokens or ints (faster).
+        docvecs : :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors`
+            The vector representations of the documents in our corpus. Each of them has a size == `vector_size`.
+        progress_per : int
+            Progress will be logged every `progress_per` documents.
+        trim_rule : function, optional
+            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
+            be trimmed away, or handled using the default (discard if word count < min_count).
+            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
+            or a callable that accepts parameters (word, count, min_count) and returns either
+            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
+            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
+            of the model.
+
+        Returns
+        -------
+        (int, int)
+            Tuple of (total words in the corpus, number of documents).
+
+        """
         logger.info("collecting all words and their counts")
         document_no = -1
         total_words = 0
@@ -1047,7 +1103,20 @@ def scan_vocab(self, documents, docvecs, progress_per=10000, trim_rule=None):
         return total_words, corpus_count

     def note_doctag(self, key, document_no, document_length, docvecs):
-        """Note a document tag during initial corpus scan, for structure sizing."""
+        """Note a document tag during initial corpus scan, for correctly setting the keyedvectors size.
+
+        Parameters
+        ----------
+        key : {int, str}
+            The tag to be noted.
+        document_no : int
+            The document's index in `docvecs`. Unused.
+        document_length : int
+            The document's length in words.
+        docvecs : :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors`
+            Vector representations of the documents in the corpus.
Each vector has size == `vector_size` + + """ if isinstance(key, integer_types + (integer,)): docvecs.max_rawint = max(docvecs.max_rawint, key) else: @@ -1059,12 +1128,41 @@ def note_doctag(self, key, document_no, document_length, docvecs): docvecs.count = docvecs.max_rawint + 1 + len(docvecs.offset2doctag) def indexed_doctags(self, doctag_tokens, docvecs): - """Return indexes and backing-arrays used in training examples.""" + """Return indexes and backing-arrays used in training examples. + + Parameters + ---------- + doctag_tokens : list of {str, int} + A list of tags for which we want the index. + docvecs : list of :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors` + Vector representations of the documents in the corpus. Each vector has size == `vector_size` + + Returns + ------- + list of int + Indices of the provided tag keys. + + """ return [ Doc2VecKeyedVectors._int_index(index, docvecs.doctags, docvecs.max_rawint) for index in doctag_tokens if self._tag_seen(index, docvecs)] def _tag_seen(self, index, docvecs): + """Whether or not the tag exists in our Vocabulary. + + Parameters + ---------- + index : {str, int} + The tag to be checked. + docvecs : :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors` + Vector representations of the documents in the corpus. Each vector has size == `vector_size` + + Returns + ------- + bool + Whether or not the passed tag exists in our vocabulary. + + """ if isinstance(index, integer_types + (integer,)): return index < docvecs.count else: From e92b9b4a27a663b71e69727efbbdb744d051cd77 Mon Sep 17 00:00:00 2001 From: "Stergiadis, E" Date: Tue, 6 Mar 2018 11:25:08 +0100 Subject: [PATCH 11/41] Fixed paper references --- gensim/models/word2vec.py | 82 +++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 49c41257dc..749107aaa5 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -6,7 +6,9 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """Produce word vectors with deep learning via word2vec's "skip-gram and CBOW models", using either -hierarchical softmax or negative sampling [1]_ [2]_. +hierarchical softmax or negative sampling: `Efficient Estimation of Word Representations in Vector Space +`_, `Distributed Representations of Words and Phrases and their Compositionality +`_. NOTE: There are more ways to get word vectors in Gensim than just Word2Vec. See FastText and wrappers for VarEmbed and WordRank. @@ -18,71 +20,74 @@ visit http://radimrehurek.com/2014/02/word2vec-tutorial/ **Make sure you have a C compiler before installing gensim, to use optimized (compiled) word2vec training** -(70x speedup compared to plain NumPy implementation [3]_). +(70x speedup compared to plain NumPy implementation `Optimizing word2vec in gensim +`_). -Initialize a model with e.g.:: +Examples +-------- - >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) +#. Initialize a model with e.g.:: -Persist a model to disk with:: +>>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) - >>> model.save(fname) - >>> model = Word2Vec.load(fname) # you can continue training with the loaded model! +#. Persist a model to disk with:: -The word vectors are stored in a KeyedVectors instance in model.wv. +>>> model.save(fname) +>>> model = Word2Vec.load(fname) # you can continue training with the loaded model! + +The word vectors are stored in a KeyedVectors instance in `model.wv`. 
This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec:: - >>> model.wv['computer'] # numpy vector of a word - array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) +>>> model.wv['computer'] # numpy vector of a word +array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) The word vectors can also be instantiated from an existing file on disk in the word2vec C format as a KeyedVectors instance. -NOTE: It is impossible to continue training the vectors loaded from the C format because hidden weights, +**NOTE**: It is impossible to continue training the vectors loaded from the C format because hidden weights, vocabulary frequency and the binary tree is missing:: - >>> from gensim.models import KeyedVectors - >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format - >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format +>>> from gensim.models import KeyedVectors +>>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format +>>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format -You can perform various NLP word tasks with the model. Some of them -are already built-in:: +#. You can perform various NLP word tasks with the model. Some of them are already built-in:: - >>> model.wv.most_similar(positive=['woman', 'king'], negative=['man']) - [('queen', 0.50882536), ...] +>>> model.wv.most_similar(positive=['woman', 'king'], negative=['man']) +[('queen', 0.50882536), ...] - >>> model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man']) - [('queen', 0.71382287), ...] +>>> model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man']) +[('queen', 0.71382287), ...] - >>> model.wv.doesnt_match("breakfast cereal dinner lunch".split()) - 'cereal' +>>> model.wv.doesnt_match("breakfast cereal dinner lunch".split()) +'cereal' - >>> model.wv.similarity('woman', 'man') - 0.73723527 +>>> model.wv.similarity('woman', 'man') +0.73723527 -Probability of a text under the model:: +#. Probability of a text under the model:: - >>> model.score(["The fox jumped over a lazy dog".split()]) - 0.2158356 +>>> model.score(["The fox jumped over a lazy dog".split()]) +0.2158356 -Correlation with human opinion on word similarity:: +#. Correlation with human opinion on word similarity:: - >>> model.wv.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv')) - 0.51, 0.62, 0.13 +>>> model.wv.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv')) +0.51, 0.62, 0.13 -And on analogies:: +#. And on analogies:: - >>> model.wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt')) +>>> model.wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt')) and so on. If you're finished training a model (i.e. no more updates, only querying), then switch to the :mod:`gensim.models.KeyedVectors` instance in wv - >>> word_vectors = model.wv - >>> del model +>>> word_vectors = model.wv +>>> del model to trim unneeded model memory = use much less RAM. @@ -90,14 +95,9 @@ detect phrases longer than one word. Using phrases, you can learn a word2vec model where "words" are actually multiword expressions, such as `new_york_times` or `financial_crisis`: - >>> bigram_transformer = gensim.models.Phrases(sentences) - >>> model = Word2Vec(bigram_transformer[sentences], size=100, ...) 
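+
+You can also continue training with new data later (online training), assuming `more_sentences`
+is a corpus in the same format as `sentences`::
+
+>>> model.build_vocab(more_sentences, update=True)
+>>> model.train(more_sentences, total_examples=model.corpus_count, epochs=model.epochs)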
+>>> bigram_transformer = gensim.models.Phrases(sentences) +>>> model = Word2Vec(bigram_transformer[sentences], size=100, ...) -.. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. - Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. -.. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. - Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013. -.. [3] Optimizing word2vec in gensim, http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/ """ from __future__ import division # py3 "true division" From 9093eab030b04c512b772ec324284605638d1b41 Mon Sep 17 00:00:00 2001 From: "Stergiadis, E" Date: Tue, 6 Mar 2018 15:12:02 +0100 Subject: [PATCH 12/41] minor referencing fixes --- gensim/models/doc2vec.py | 2 +- gensim/models/word2vec.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index c555288ba5..0a3cc6e80c 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -17,7 +17,7 @@ `_. For a real world usage scenario, see the `Doc2vec in gensim tutorial -`_ +`_. **Make sure you have a C compiler before installing gensim, to use optimized (compiled) doc2vec training** (70x speedup [blog]_). diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 749107aaa5..9c68c4b174 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -824,13 +824,11 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor Note that you should specify total_sentences; we'll run into problems if you ask to score more than this number of sentences but it is inefficient to set the value too high. - See the article by [#taddy]_ and the gensim demo at [#deepir]_ for examples of + See the `article by Matt Taddy: "Document Classification by Inversion of Distributed Language Representations" + `_ and the + `gensim demo `_ for examples of how to use such scores in document classification. - .. [#taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, - in Proceedings of the 2015 Conference of the Association of Computational Linguistics. - .. [#deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb - Parameters ---------- sentences : iterable of iterables From c07afa4ff47707de6a932151333d11df223348d1 Mon Sep 17 00:00:00 2001 From: "Stergiadis, E" Date: Tue, 6 Mar 2018 15:49:35 +0100 Subject: [PATCH 13/41] sphinx identation --- gensim/models/doc2vec.py | 10 ++++---- gensim/models/word2vec.py | 54 +++++++++++++++++++-------------------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 0a3cc6e80c..b93535c3b1 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -27,16 +27,16 @@ #. Initialize a model with e.g.:: ->>> model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4) + >>> model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4) #. Persist a model to disk with:: ->>> model.save(fname) ->>> model = Doc2Vec.load(fname) # you can continue training with the loaded model! + >>> model.save(fname) + >>> model = Doc2Vec.load(fname) # you can continue training with the loaded model! 
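+
+#. Infer a vector for a new, unseen document (the token list is illustrative)::
+
+    >>> vector = model.infer_vector(["system", "response"])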
-If you're finished training a model (=no more updates, only querying), you can do
+If you're finished training a model (=no more updates, only querying), you can do::

->>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True):
+    >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

 to trim unneeded model memory = use (much) less RAM.

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 9c68c4b174..00d59e0f50 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -28,18 +28,18 @@

 #. Initialize a model with e.g.::

->>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
+    >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)

 #. Persist a model to disk with::

->>> model.save(fname)
->>> model = Word2Vec.load(fname)  # you can continue training with the loaded model!
+    >>> model.save(fname)
+    >>> model = Word2Vec.load(fname)  # you can continue training with the loaded model!

-The word vectors are stored in a KeyedVectors instance in `model.wv`.
-This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec::
+    The word vectors are stored in a KeyedVectors instance in `model.wv`.
+    This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec::

->>> model.wv['computer']  # numpy vector of a word
-array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32)
+    >>> model.wv['computer']  # numpy vector of a word
+    array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32)

 The word vectors can also be instantiated from an existing file on disk in the word2vec C format
 as a KeyedVectors instance.

 **NOTE**: It is impossible to continue training the vectors loaded from the C format because hidden weights,
 vocabulary frequency and the binary tree is missing::

->>> from gensim.models import KeyedVectors
->>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False)  # C text format
->>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True)  # C binary format
+    >>> from gensim.models import KeyedVectors
+    >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False)  # C text format
+    >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True)  # C binary format

 #. You can perform various NLP word tasks with the model. Some of them are already built-in::

->>> model.wv.most_similar(positive=['woman', 'king'], negative=['man'])
-[('queen', 0.50882536), ...]
+    >>> model.wv.most_similar(positive=['woman', 'king'], negative=['man'])
+    [('queen', 0.50882536), ...]

->>> model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
-[('queen', 0.71382287), ...]
+    >>> model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
+    [('queen', 0.71382287), ...]

->>> model.wv.doesnt_match("breakfast cereal dinner lunch".split())
-'cereal'
+    >>> model.wv.doesnt_match("breakfast cereal dinner lunch".split())
+    'cereal'

->>> model.wv.similarity('woman', 'man')
-0.73723527
+    >>> model.wv.similarity('woman', 'man')
+    0.73723527

 #. Probability of a text under the model::

->>> model.score(["The fox jumped over a lazy dog".split()])
-0.2158356
+    >>> model.score(["The fox jumped over a lazy dog".split()])
+    0.2158356

 #. Correlation with human opinion on word similarity::

->>> model.wv.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv'))
-0.51, 0.62, 0.13
+    >>> model.wv.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv'))
+    0.51, 0.62, 0.13

 #. And on analogies::

->>> model.wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt'))
+    >>> model.wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt'))

 and so on.

 If you're finished training a model (i.e. no more updates, only querying),
 then switch to the :mod:`gensim.models.KeyedVectors` instance in wv

->>> word_vectors = model.wv
->>> del model
+    >>> word_vectors = model.wv
+    >>> del model

 to trim unneeded model memory = use much less RAM.

@@ -95,8 +95,8 @@
 detect phrases longer than one word. Using phrases, you can learn a word2vec model
 where "words" are actually multiword expressions, such as `new_york_times` or `financial_crisis`:

->>> bigram_transformer = gensim.models.Phrases(sentences)
->>> model = Word2Vec(bigram_transformer[sentences], size=100, ...)
+    >>> bigram_transformer = gensim.models.Phrases(sentences)
+    >>> model = Word2Vec(bigram_transformer[sentences], size=100, ...)

 """
 from __future__ import division  # py3 "true division"

From 4a14a3e7f10456607beaa89883aa9a0a477c1d45 Mon Sep 17 00:00:00 2001
From: "Stergiadis, E"
Date: Thu, 8 Mar 2018 10:29:33 +0100
Subject: [PATCH 14/41] Added docstrings for the private methods in
 `BaseAny2Vec`

---
 gensim/models/base_any2vec.py | 149 ++++++++++++++++++++++++++++++++--
 1 file changed, 144 insertions(+), 5 deletions(-)

diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py
index d5098ce277..e6161324c8 100644
--- a/gensim/models/base_any2vec.py
+++ b/gensim/models/base_any2vec.py
@@ -28,7 +28,14 @@

 class BaseAny2VecModel(utils.SaveLoad):
     """Base class for training, using and evaluating any2vec model.
-    Contains implementation for multi-threaded training.
+
+    Contains implementation for multi-threaded training. The purpose of this class is to provide a
+    reference interface for concrete embedding implementations, whether the input space is a corpus
+    of words, documents or anything else. At the same time, functionality that we expect to be common
+    for those implementations is provided here to avoid code duplication.
+
+    In the special but usual case where the input space consists of words, a more specialized layer
+    is provided; consider inheriting from :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`.

     """

@@ -83,7 +90,24 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N
         raise NotImplementedError()

     def _worker_loop(self, job_queue, progress_queue):
-        """Train the model, lifting lists of data from the job_queue."""
+        """Train the model, lifting lists of data from the queue.
+
+        This function will be called in parallel by multiple workers (threads or processes) to make
+        optimal use of multicore machines.
+
+        Parameters
+        ----------
+        job_queue : Queue of (iterable of object, dict)
+            A queue of jobs still to be processed. The worker will take up jobs from this queue.
+            Each job is represented by a tuple where the first element is the corpus chunk to be processed and
+            the second is the dictionary of parameters.
+        progress_queue : Queue of (int, int, int)
+            A queue of progress reports. Each report is represented as a tuple of these 3 elements:
+                * size of data chunk processed, for example number of sentences in the corpus chunk.
+ * Effective word count used in training (after ignoring unknown words and trimming the sentence length). + * Total word count used in training. + + """ thread_private_mem = self._get_thread_working_mem() jobs_processed = 0 while True: @@ -106,7 +130,30 @@ def _worker_loop(self, job_queue, progress_queue): logger.debug("worker exiting, processed %i jobs", jobs_processed) def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=None, total_words=None): - """Fill jobs queue using the input `data_iterator`.""" + """Fill the jobs queue using the data found in the input stream. + + Each job is represented by a tuple where the first element is the corpus chunk to be processed and + the second is the dictionary of parameters. + + Parameters + ---------- + data_iterator : iterable of iterable of object + The input corpus. This will be split in chunks and these chunks will be pushed to the queue. + job_queue : Queue of (iterable of object, dict) + A queue of jobs still to be processed. The worker will take up jobs from this queue. + Each job is represented by a tuple where the first element is the corpus chunk to be processed and + the second is the dictionary of parameters. + cur_epoch : int, optional + The current training epoch, needed to compute the training parameters for each job. + For example in many implementations the learning rate would be dropping with the number of epochs. + total_examples : int, optional + Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences + in a corpus. Used to log progress. + total_words : int, optional + Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw + words in a corpus. Used to log progress. + + """ job_batch, batch_size = [], 0 pushed_words, pushed_examples = 0, 0 next_job_params = self._get_job_params(cur_epoch) @@ -166,6 +213,40 @@ def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_examples=None, total_words=None, report_delay=1.0): + """Get the progress report for a single training epoch. + + Parameters + ---------- + progress_queue : Queue of (int, int, int) + A queue of progress reports. Each report is represented as a tuple of these 3 elements: + * size of data chunk processed, for example number of sentences in the corpus chunk. + * Effective word count used in training (after ignoring unknown words and trimming the sentence length). + * Total word count used in training. + job_queue : Queue of (iterable of object, dict) + A queue of jobs still to be processed. The worker will take up jobs from this queue. + Each job is represented by a tuple where the first element is the corpus chunk to be processed and + the second is the dictionary of parameters. + cur_epoch : int, optional + The current training epoch, needed to compute the training parameters for each job. + For example in many implementations the learning rate would be dropping with the number of epochs. + total_examples : int, optional + Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences + in a corpus. Used to log progress. + total_words : int, optional + Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw + words in a corpus. Used to log progress. 
report_delay : float, optional
+            Number of seconds between two consecutive progress report messages in the logger.
+
+        Returns
+        -------
+        (int, int, int)
+            The epoch report consisting of three elements:
+                * size of data chunk processed, for example number of sentences in the corpus chunk.
+                * Effective word count used in training (after ignoring unknown words and trimming the sentence length).
+                * Total word count used in training.
+
+        """
         example_count, trained_word_count, raw_word_count = 0, 0, 0
         start, next_report = default_timer() - 0.00001, 1.0
         job_tally = 0
@@ -202,7 +283,35 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam

     def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None,
                      total_words=None, queue_factor=2, report_delay=1.0):
-        """Train one epoch."""
+        """Train the model for a single epoch.
+
+        Parameters
+        ----------
+        data_iterable : iterable of list of object
+            The input corpus. This will be split in chunks and these chunks will be pushed to the queue.
+        cur_epoch : int, optional
+            The current training epoch, needed to compute the training parameters for each job.
+            For example in many implementations the learning rate would be dropping with the number of epochs.
+        total_examples : int, optional
+            Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences
+            in a corpus. Used to log progress.
+        total_words : int, optional
+            Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw
+            words in a corpus. Used to log progress.
+        queue_factor : int, optional
+            Multiplier for size of queue -> size = number of workers * queue_factor.
+        report_delay : float, optional
+            Number of seconds between two consecutive progress report messages in the logger.
+
+        Returns
+        -------
+        (int, int, int)
+            The training report for this epoch consisting of three elements:
+                * size of data chunk processed, for example number of sentences in the corpus chunk.
+                * Effective word count used in training (after ignoring unknown words and trimming the sentence length).
+                * Total word count used in training.
+
+        """
         job_queue = Queue(maxsize=queue_factor * self.workers)
         progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

@@ -230,7 +339,37 @@ def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None,

     def train(self, data_iterable, epochs=None, total_examples=None,
               total_words=None, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs):
-        """Handle multi-worker training."""
+        """Train the model for multiple epochs using multiple workers.
+
+        Parameters
+        ----------
+        data_iterable : iterable of list of object
+            The input corpus. This will be split in chunks and these chunks will be pushed to the queue.
+        epochs : int, optional
+            Number of epochs (training iterations over the whole input) of training.
+        total_examples : int, optional
+            Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences
+            in a corpus. Used to log progress.
+        total_words : int, optional
+            Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw
+            words in a corpus. Used to log progress.
+        queue_factor : int, optional
+            Multiplier for size of queue -> size = number of workers * queue_factor.
+        report_delay : float, optional
+            Number of seconds between two consecutive progress report messages in the logger.
+ callbacks : list of :class: `~gensim.models.callbacks.CallbackAny2Vec`, optional + List of callbacks that need to be executed/run at specific stages during training. + **kwargs + Additional key word parameters for the specific model inheriting from this class. + + Returns + ------- + (int, int) + The total training report consisting of two elements: + * size of total data processed, for example number of sentences in the whole corpus. + * Effective word count used in training (after ignoring unknown words and trimming the sentence length). + + """ self._set_train_params(**kwargs) if callbacks: self.callbacks = callbacks From a7f3f0ed633ec91bd9eab1d2e8fe3ebf3932a446 Mon Sep 17 00:00:00 2001 From: "Stergiadis, E" Date: Thu, 8 Mar 2018 10:53:59 +0100 Subject: [PATCH 15/41] Applied all code review corrections, example fix still pending --- gensim/models/base_any2vec.py | 72 ++++++++++++++++++++--------------- gensim/models/doc2vec.py | 22 +++++++++-- gensim/models/word2vec.py | 12 +++--- 3 files changed, 68 insertions(+), 38 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index e6161324c8..c054ea69f0 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -97,7 +97,7 @@ def _worker_loop(self, job_queue, progress_queue): Parameters ---------- - job_queue : Queue of (iterable of object, dict) + job_queue : Queue of (list of object, dict) A queue of jobs still to be processed. The worker will take up jobs from this queue. Each job is represented by a tuple where the first element is the corpus chunk to be processed and the second is the dictionary of parameters. @@ -137,9 +137,9 @@ def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=No Parameters ---------- - data_iterator : iterable of iterable of object + data_iterator : iterable of list of object The input corpus. This will be split in chunks and these chunks will be pushed to the queue. - job_queue : Queue of (iterable of object, dict) + job_queue : Queue of (list of object, dict) A queue of jobs still to be processed. The worker will take up jobs from this queue. Each job is represented by a tuple where the first element is the corpus chunk to be processed and the second is the dictionary of parameters. @@ -222,7 +222,7 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam * size of data chunk processed, for example number of sentences in the corpus chunk. * Effective word count used in training (after ignoring unknown words and trimming the sentence length). * Total word count used in training. - job_queue : Queue of (iterable of object, dict) + job_queue : Queue of (list of object, dict) A queue of jobs still to be processed. The worker will take up jobs from this queue. Each job is represented by a tuple where the first element is the corpus chunk to be processed and the second is the dictionary of parameters. @@ -287,7 +287,7 @@ def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, Parameters ---------- - data_iterable : iterable of iterable of object + data_iterable : iterable of list of object The input corpus. This will be split in chunks and these chunks will be pushed to the queue. cur_epoch : int, optional The current training epoch, needed to compute the training parameters for each job. 
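
For readers following these hunks, the producer/consumer layout the docstrings describe can be sketched in a few self-contained lines. This is an illustrative toy only, with names such as `worker_loop` invented here, not gensim's actual implementation::

    from queue import Queue
    from threading import Thread

    def worker_loop(job_queue, progress_queue):
        """Consume (data_chunk, parameters) jobs until a None sentinel arrives."""
        while True:
            job = job_queue.get()
            if job is None:  # sentinel pushed by the producer: no more jobs
                break
            data_chunk, params = job
            examples = len(data_chunk)
            raw_words = sum(len(sentence) for sentence in data_chunk)
            # A real worker would train on the chunk here; this toy just
            # reports every raw word as effectively trained.
            progress_queue.put((examples, raw_words, raw_words))

    workers, queue_factor = 3, 2
    job_queue = Queue(maxsize=queue_factor * workers)             # bounded, as in _train_epoch
    progress_queue = Queue(maxsize=(queue_factor + 1) * workers)

    threads = [Thread(target=worker_loop, args=(job_queue, progress_queue)) for _ in range(workers)]
    for thread in threads:
        thread.start()

    corpus = [['hello', 'world'], ['machine', 'learning']] * 10
    for start in range(0, len(corpus), 5):                        # the producer pushes fixed-size chunks
        job_queue.put((corpus[start:start + 5], {'alpha': 0.025}))
    for _ in threads:
        job_queue.put(None)                                       # one sentinel per worker
    for thread in threads:
        thread.join()

The bounded queues are the point of the `queue_factor` parameter documented above: the producer blocks instead of reading the whole corpus into memory when the workers fall behind.
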
@@ -343,7 +343,7 @@ def train(self, data_iterable, epochs=None, total_examples=None, Parameters ---------- - data_iterable : iterable of iterable of object + data_iterable : iterable of list of object The input corpus. This will be split in chunks and these chunks will be pushed to the queue. epochs : int, optional Number of epochs (training iterations over the whole input) of training. @@ -445,20 +445,20 @@ def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbac Parameters ---------- - sentences : iterable of iterable of str + sentences : iterable of list of str, optional Can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - workers : int + or :class:`~gensim.models.word2vec.LineSentence` for such examples. + workers : int, optional Number of working threads, used for multiprocessing. - vector_size : int + vector_size : int, optional Dimensionality of the feature vectors. - epochs : int + epochs : int, optional Number of iterations (epochs) of training through the corpus. callbacks : list of :class: `~gensim.models.callbacks.CallbackAny2Vec`, optional List of callbacks that need to be executed/run at specific stages during training. - batch_words : int + batch_words : int, optional Number of words to be processed by a single job. trim_rule : function, optional Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, @@ -466,35 +466,39 @@ def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbac Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. + The input parameters are of the following types + * word: str. The word we are examining + * count: int. The word's occurence count in the corpus + * min_count: int. The minimum count threshold. Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. - sg : int {1, 0} + sg : {1, 0}, optional Defines the training algorithm. If 1, skip-gram is used, otherwise, CBOW is employed. - alpha : float + alpha : float, optional The beginning learning rate. This will linearly reduce with iterations until it reaches `min_alpha`. - window : int + window : int, optional The maximum distance between the current and predicted word within a sentence. - seed : int + seed : int, optional Seed for the random number generator. Initial vectors for each word are seeded with a hash of the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run, you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires use of the `PYTHONHASHSEED` environment variable to control hash randomization). - hs : int {1,0} + hs : {1,0}, optional If 1, hierarchical softmax will be used for model training. If set to 0, and `negative` is non-zero, negative sampling will be used. 
- negative : int + negative : int, optional If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn (usually between 5-20). If set to 0, no negative sampling is used. - cbow_mean : int {1,0} + cbow_mean : {1,0}, optional If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. min_alpha : float, optional Final learning rate. Drops linearly with the number of iterations from `alpha`. compute_loss : bool, optional If True, loss will be computed while training the Word2Vec model and stored in :attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss`. - fast_version : int {-1, 1} + fast_version : {-1, 1}, optional Whether or not the fast cython implementation of the internal training methods is available. 1 means it is. **kwargs Key word arguments needed to allow children classes to accept more arguments. @@ -654,7 +658,7 @@ def cum_table(self): del self.vocabulary.cum_table def __str__(self): - """Return a human readable representation of the object. + """Get a human readable representation of the object. Returns ------- @@ -671,11 +675,11 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca Parameters ---------- - sentences : iterable of iterable of str + sentences : iterable of list of str Can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + or :class:`~gensim.models.word2vec.LineSentence` module for such examples. update : bool, optional If true, the new words in `sentences` will be added to model's vocab. progress_per : int, optional @@ -688,6 +692,10 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. + The input parameters are of the following types + * word: str. The word we are examining + * count: int. The word's occurence count in the corpus + * min_count: int. The minimum count threshold. Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. **kwargs @@ -709,7 +717,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No Parameters ---------- - word_freq : dict of (unicode str, int) + word_freq : dict of (str, int) A mapping from a word in the vocabulary to its frequency count. keep_raw_vocab : bool, optional If False, delete the raw vocabulary after the scaling is done to free up RAM. @@ -721,6 +729,10 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. + The input parameters are of the following types + * word: str. The word we are examining + * count: int. The word's occurence count in the corpus + * min_count: int. The minimum count threshold. 
Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. update : bool, optional @@ -767,7 +779,7 @@ def estimate_memory(self, vocab_size=None, report=None): Returns ------- - dict of (str, int), optional + dict of (str, int) A dictionary from string representations of the model's memory consuming members to their size in bytes. """ vocab_size = vocab_size or len(self.wv.vocab) @@ -792,11 +804,11 @@ def train(self, sentences, total_examples=None, total_words=None, Parameters ---------- - sentences : iterable of iterable of str + sentences : iterable of list of str Can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + or :class:`~gensim.models.word2vec.LineSentence` module for such examples. total_examples : int, optional Count of sentences. total_words : int, optional @@ -851,7 +863,7 @@ def _get_job_params(self, cur_epoch): return alpha def _update_job_params(self, job_params, epoch_progress, cur_epoch): - """Returns the correct learning rate for the next iteration. + """Get the correct learning rate for the next iteration. Parameters ---------- @@ -893,7 +905,7 @@ def _raw_word_count(self, job): Parameters ---------- - job: iterable of iterable of str + job: iterable of list of str The corpus chunk processed in a single batch. Returns @@ -1011,7 +1023,7 @@ def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, tot Parameters ---------- - job_queue : Queue of (iterable of object, dict of (str, float)) + job_queue : Queue of (list of object, dict of (str, float)) The queue of jobs still to be performed by workers. Each job is represented as a tuple containing the batch of data to be processed and the parameters to be used for the processing as a dict. progress_queue : Queue of (int, int, int) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index b93535c3b1..064ace990f 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -451,6 +451,10 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. + The input parameters are of the following types + * word: str. The word we are examining + * count: int. The word's occurence count in the corpus + * min_count: int. The minimum count threshold. Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`, optional @@ -648,7 +652,7 @@ def train(self, documents, total_examples=None, total_words=None, queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks) def _raw_word_count(self, job): - """Return the number of words in a given job. + """Get the number of words in a given job. Parameters ---------- @@ -923,12 +927,16 @@ def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_voca Indicates how many words to process before showing/updating the progress. 
keep_raw_vocab : bool If not true, delete the raw vocabulary after the scaling is done and free up RAM. - trim_rule : function + trim_rule : function, optional Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. + The input parameters are of the following types + * word: str. The word we are examining + * count: int. The word's occurence count in the corpus + * min_count: int. The minimum count threshold. Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. **kwargs @@ -965,6 +973,10 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. + The input parameters are of the following types + * word: str. The word we are examining + * count: int. The word's occurence count in the corpus + * min_count: int. The minimum count threshold. Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. update : bool, optional @@ -1047,6 +1059,10 @@ def scan_vocab(self, documents, docvecs, progress_per=10000, trim_rule=None): Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. + The input parameters are of the following types + * word: str. The word we are examining + * count: int. The word's occurence count in the corpus + * min_count: int. The minimum count threshold. Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. @@ -1128,7 +1144,7 @@ def note_doctag(self, key, document_no, document_length, docvecs): docvecs.count = docvecs.max_rawint + 1 + len(docvecs.offset2doctag) def indexed_doctags(self, doctag_tokens, docvecs): - """Return indexes and backing-arrays used in training examples. + """Get the indexes and backing-arrays used in training examples. Parameters ---------- diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 00d59e0f50..26d158b4fe 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -241,8 +241,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss return result def score_sentence_sg(model, sentence, work=None): - """ - Obtain likelihood score for a single sentence in a fitted skip-gram representation. + """Obtain likelihood score for a single sentence in a fitted skip-gram representation. Notes ----- @@ -283,8 +282,7 @@ def score_sentence_sg(model, sentence, work=None): return log_prob_sentence def score_sentence_cbow(model, sentence, work=None, neu1=None): - """ - Obtain likelihood score for a single sentence in a fitted CBOW representation. + """Obtain likelihood score for a single sentence in a fitted CBOW representation. 
 Notes
 -----

@@ -657,12 +655,16 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
             Hash function to use to randomly initialize weights, for increased training reproducibility.
         iter : int
             Number of iterations (epochs) over the corpus.
-        trim_rule : function
+        trim_rule : function, optional
             Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
             be trimmed away, or handled using the default (discard if word count < min_count).
             Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
             or a callable that accepts parameters (word, count, min_count) and returns either
             :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
+            The input parameters are of the following types
+                * word: str. The word we are examining
+                * count: int. The word's occurrence count in the corpus
+                * min_count: int. The minimum count threshold.
             Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
             of the model.
         sorted_vocab : int {1,0}

From 69d524d1d77c7242226caac388c6e808a6828f7b Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Mon, 12 Mar 2018 17:57:08 +0100
Subject: [PATCH 16/41] Added missing docstrings

---
 gensim/models/base_any2vec.py | 51 +++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py
index c054ea69f0..4a3906a9c2 100644
--- a/gensim/models/base_any2vec.py
+++ b/gensim/models/base_any2vec.py
@@ -42,11 +42,26 @@ class BaseAny2VecModel(utils.SaveLoad):
     def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000):
         """Initialize model parameters.

+        Notes
+        -----
         A subclass should initialize the following attributes:
         - self.kv (instance of concrete implementation of `BaseKeyedVectors` interface)
         - self.vocabulary (instance of concrete implementation of `BaseVocabBuilder` abstract class)
         - self.trainables (instance of concrete implementation of `BaseTrainables` abstract class)

+        Parameters
+        ----------
+        workers : int
+            Number of working threads, used for multiprocessing.
+        vector_size : int
+            Dimensionality of the feature vectors.
+        epochs : int
+            Number of iterations (epochs) of training through the corpus.
+        callbacks : list of :class: `~gensim.models.callbacks.CallbackAny2Vec`, optional
+            List of callbacks that need to be executed/run at specific stages during training.
+        batch_words : int
+            Number of words to be processed by a single job.
+
         """
         self.vector_size = int(vector_size)
         self.workers = int(workers)
@@ -414,9 +429,45 @@ def train(self, data_iterable, epochs=None, total_examples=None,

     @classmethod
     def load(cls, fname_or_handle, **kwargs):
+        """Load a previously saved object (using :meth:`~gensim.models.base_any2vec.BaseAny2VecModel.save`) from file.
+
+        Parameters
+        ----------
+        fname_or_handle : {str, file-like object}
+            Path to file that contains needed object or handle to the opened file.
+        **kwargs
+            Key word arguments propagated to :meth:`~gensim.utils.SaveLoad.load`.
+
+        See Also
+        --------
+        :meth:`~gensim.models.base_any2vec.BaseAny2VecModel.save`
+
+        Returns
+        -------
+        object
+            Object loaded from `fname_or_handle`.
+
+        Raises
+        ------
+        IOError
+            When methods are called on instance (should be called from class).
+
+        """
         return super(BaseAny2VecModel, cls).load(fname_or_handle, **kwargs)

     def save(self, fname_or_handle, **kwargs):
+        """Save the object to file.
+ + Parameters + ---------- + fname_or_handle : {str, file-like object} + Path to file where the model will be persisted. + **kwargs + Key word arguments propagated to :meth:`~gensim.utils.SaveLoad.save`. + + See Also + -------- + :meth:`~gensim.models.base_any2vec.BaseAny2VecModel.save` + """ super(BaseAny2VecModel, self).save(fname_or_handle, **kwargs) From 4707c3739a4089627f67b9e91d782f973f796b39 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Mon, 12 Mar 2018 17:57:41 +0100 Subject: [PATCH 17/41] Fixed `int {1, 0}` -> `{1, 0}` --- gensim/models/doc2vec.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 064ace990f..3d3b6163aa 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -394,7 +394,7 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 Can be simply a list of elements, but for larger corpora,consider an iterable that streams the documents directly from disk/network. If you don't supply `documents`, the model is left uninitialized -- use if you plan to initialize it in some other way. - dm : int {1,0}, optional + dm : {1,0}, optional Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used. Otherwise, `distributed bag of words` (PV-DBOW) is employed. size : int, optional @@ -424,17 +424,17 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 Use these many worker threads to train the model (=faster training with multicore machines). iter : int, optional Number of iterations (epochs) over the corpus. - hs : int {1,0}, optional + hs : {1,0}, optional If 1, hierarchical softmax will be used for model training. If set to 0, and `negative` is non-zero, negative sampling will be used. negative : int, optional If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn (usually between 5-20). If set to 0, no negative sampling is used. - dm_mean : int {1,0}, optional + dm_mean : {1,0}, optional If 0 , use the sum of the context word vectors. If 1, use the mean. Only applies when `dm` is used in non-concatenative mode. - dm_concat : int {1,0}, optional + dm_concat : {1,0}, optional If 1, use concatenation of context vectors rather than sum/average; Note concatenation results in a much-larger model, as the input is no longer the size of one (sampled or arithmetically combined) word vector, but the @@ -442,7 +442,7 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 dm_tag_count : int, optional Expected constant number of document tags per document, when using dm_concat mode. - dbow_words : int {1,0}, optional + dbow_words : {1,0}, optional If set to 1 trains word-vectors (in skip-gram fashion) simultaneous with DBOW doc-vector training; If 0, only trains doc-vectors (faster). trim_rule : function, optional @@ -451,10 +451,10 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The input parameters are of the following types - * word: str. The word we are examining - * count: int. The word's occurence count in the corpus - * min_count: int. The minimum count threshold. 
+            The input parameters are of the following types:
+                - word: str. The word we are examining
+                - count: int. The word's occurrence count in the corpus
+                - min_count: int. The minimum count threshold.
             Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
             of the model.
         callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`, optional
@@ -1033,7 +1033,7 @@ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=T
             useful range is (0, 1e-5).
         sorted_vocab : bool
             If True, sort the vocabulary by descending frequency before assigning word indexes.
-        null_word : int {0, 1}
+        null_word : {0, 1}
             If True, a null pseudo-word will be created for padding when using concatenative L1 (run-of-words).
             This word is only ever input – never predicted – so count, huffman-point, etc doesn't matter.

From 3a85ac5c04c0f7e4bf24db92e841e073810c87a0 Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Mon, 12 Mar 2018 18:27:25 +0100
Subject: [PATCH 18/41] Fixed examples and code review corrections

---
 gensim/models/doc2vec.py | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index 3d3b6163aa..4e83c495f3 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -27,16 +27,20 @@

 #. Initialize a model with e.g.::

-    >>> model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4)
+    >>> from gensim.test.utils import common_texts
+    >>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+    >>>
+    >>> documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
+    >>> model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

 #. Persist a model to disk with::

-    >>> model.save(fname)
-    >>> model = Doc2Vec.load(fname)  # you can continue training with the loaded model!
+    >>> model.save('/tmp/model')
+    >>> model = Doc2Vec.load('/tmp/model')  # you can continue training with the loaded model!

 If you're finished training a model (=no more updates, only querying), you can do::

-    >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True):
+    >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

 to trim unneeded model memory = use (much) less RAM.

@@ -183,7 +187,7 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N
         `learn_words` and `train_words` are set to True.
     learn_hidden : bool, optional
         Whether or not the weights of the hidden layer will be updated.
-    word_vectors : iterable of iterable of float, optional
+    word_vectors : iterable of list of float, optional
         Vector representations of each word in the model's vocabulary.
     word_locks : list of float, optional
         Lock factors for each word in the vocabulary.
@@ -269,7 +273,7 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,
         `learn_words` and `train_words` are set to True.
     learn_hidden : bool, optional
         Whether or not the weights of the hidden layer will be updated.
-    word_vectors : iterable of iterable of float, optional
+    word_vectors : iterable of list of float, optional
         Vector representations of each word in the model's vocabulary.
     word_locks : list of float, optional
         Lock factors for each word in the vocabulary.
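
One usage step the updated module examples stop short of is inference for an unseen document. As an illustrative addition, not part of this patch, the toy model built in the examples above supports::

    >>> vector = model.infer_vector(['human', 'interface'])  # numpy vector for a new, unseen document

The `word_locks` arrays documented in the hunks above act as per-word multipliers on the learning rate: a lock factor of 0.0 freezes a word's vector during these training routines, while the default of 1.0 allows normal updates.
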
@@ -386,11 +390,11 @@ class Doc2Vec(BaseWordEmbeddingsModel): def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(), **kwargs): """Initialize the model from an iterable of `documents`. Each document is a - TaggedDocument object that will be used for training. + :class:`~gensim.models.doc2vec.TaggedDocument` object that will be used for training. Parameters ---------- - documents : iterable of iterables of :class:`~gensim.models.doc2vec.TaggedDocument`, optional + documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional Can be simply a list of elements, but for larger corpora,consider an iterable that streams the documents directly from disk/network. If you don't supply `documents`, the model is left uninitialized -- use if you plan to initialize it in some other way. @@ -567,7 +571,7 @@ def _do_train_job(self, job, alpha, inits): Parameters ---------- - job : iterable of iterable of str + job : iterable of list of str The corpus chunk to be used for training this batch. alpha : float Learning rate to be used for training this batch. @@ -622,7 +626,7 @@ def train(self, documents, total_examples=None, total_words=None, Parameters ---------- - documents : iterable of iterables of :class:`~gensim.models.doc2vec.TaggedDocument` + documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument` Can be simply a list of elements, but for larger corpora,consider an iterable that streams the documents directly from disk/network. If you don't supply `documents`, the model is left uninitialized -- use if you plan to initialize it in some other way. @@ -656,7 +660,7 @@ def _raw_word_count(self, job): Parameters ---------- - job : iterable of iterable of str + job : iterable of list of str Corpus chunk. Returns @@ -916,7 +920,7 @@ def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_voca Parameters ---------- - documents : iterable of iterables of str + documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument` Can be simply a list of :class:`~gensim.models.doc2vec.TaggedDocument` elements, but for larger corpora, consider an iterable that streams the documents directly from disk/network. See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument` From f041cf1860b522feac46791668c4eb49588e9424 Mon Sep 17 00:00:00 2001 From: "Stergiadis, E" Date: Wed, 14 Mar 2018 15:19:11 +0100 Subject: [PATCH 19/41] Fixed examples and applied code review corrections (optional arguments, correct types, blank lines at end of docstrings --- gensim/models/word2vec.py | 95 ++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 40 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 26d158b4fe..e6649e6605 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -11,7 +11,7 @@ `_. NOTE: There are more ways to get word vectors in Gensim than just Word2Vec. -See FastText and wrappers for VarEmbed and WordRank. +See :class:`~gensim.models.fasttext.FastText` and wrappers for VarEmbed and WordRank. The training algorithms were originally ported from the C package https://code.google.com/p/word2vec/ and extended with additional functionality. @@ -27,19 +27,19 @@ -------- #. 
Initialize a model with e.g.:: - - >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) + >>> from gensim.test.utils import common_texts + >>> + >>> model = Word2Vec(size=4, window=2, min_count=1, workers=4) + >>> model.build_vocab(common_texts) #. Persist a model to disk with:: - - >>> model.save(fname) - >>> model = Word2Vec.load(fname) # you can continue training with the loaded model! + >>> model.save("temp_model.w2v") + >>> model = Word2Vec.load("temp_model.w2v") # you can continue training with the loaded model! The word vectors are stored in a KeyedVectors instance in `model.wv`. This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec:: - >>> model.wv['computer'] # numpy vector of a word - array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) + >>> computer_vec = model.wv['computer'] # numpy vector of a word The word vectors can also be instantiated from an existing file on disk in the word2vec C format as a KeyedVectors instance. @@ -48,38 +48,43 @@ vocabulary frequency and the binary tree is missing:: >>> from gensim.models import KeyedVectors - >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format - >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format - + >>> + >>> # Save and load key word vectors in C text format. + >>> model.wv.save_word2vec_format('vectors.txt', binary=False) + >>> word_vectors = KeyedVectors.load_word2vec_format('vectors.txt', binary=False) + >>> + >>> # Save and load key word vectors in C binary format. + >>> model.wv.save_word2vec_format('vectors.bin', binary=True) + >>> word_vectors = KeyedVectors.load_word2vec_format('vectors.bin', binary=True) #. You can perform various NLP word tasks with the model. Some of them are already built-in:: - >>> model.wv.most_similar(positive=['woman', 'king'], negative=['man']) - [('queen', 0.50882536), ...] + >>> similarities = model.wv.most_similar(positive=['computer', 'human'], negative=['interface']) + >>> most_similar = similarities[0] - >>> model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man']) - [('queen', 0.71382287), ...] + >>> similarities = model.wv.most_similar_cosmul(positive=['computer', 'human'], negative=['interface']) + >>> most_similar = similarities[0] + >>> not_matching = model.wv.doesnt_match("human computer interface tree".split()) - >>> model.wv.doesnt_match("breakfast cereal dinner lunch".split()) - 'cereal' + >>> sim_score = model.wv.similarity('computer', 'human') - >>> model.wv.similarity('woman', 'man') - 0.73723527 +#. Probability of a (possibly unseen) text under the model:: -#. Probability of a text under the model:: - - >>> model.score(["The fox jumped over a lazy dog".split()]) - 0.2158356 + >>> # Note that score is only implemented for the hierarchical softmax scheme. + >>> model = Word2Vec(size=4, window=2, min_count=1, workers=4, hs=1, negative=0) + >>> model.build_vocab(common_texts) + >>> proba = model.score(["The fox jumped over a lazy dog".split()]) #. Correlation with human opinion on word similarity:: - >>> model.wv.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv')) - 0.51, 0.62, 0.13 + >>> from gensim.test.utils import datapath + >>> + >>> similarities = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv')) #. 
And on analogies:: - >>> model.wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt')) + >>> analogies = model.wv.accuracy(datapath('questions-words.txt')) and so on. @@ -94,9 +99,11 @@ Note that there is a :mod:`gensim.models.phrases` module which lets you automatically detect phrases longer than one word. Using phrases, you can learn a word2vec model where "words" are actually multiword expressions, such as `new_york_times` or `financial_crisis`: - - >>> bigram_transformer = gensim.models.Phrases(sentences) - >>> model = Word2Vec(bigram_transformer[sentences], size=100, ...) + >>> from gensim.models import Phrases + >>> + >>> bigram_transformer = Phrases(common_texts) + >>> model = Word2Vec(size=5) + >>> model.build_vocab(bigram_transformer[common_texts]) """ from __future__ import division # py3 "true division" @@ -158,7 +165,7 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): ---------- model : :class:`~gensim.models.word2Vec.Word2Vec` The Word2Vec model instance to train. - sentences : iterable of iterable of str + sentences : iterable of list of str The corpus used to train the model. alpha : float The learning rate @@ -205,9 +212,9 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss Parameters ---------- - model : :class:`~gensim.models.word2Vec.Word2Vec` + model : :class:`~gensim.models.word2vec.Word2Vec` The Word2Vec model instance to train. - sentences : iterable of iterable of str + sentences : iterable of list of str The corpus used to train the model. alpha : float The learning rate @@ -223,6 +230,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss int Number of words in the vocabulary actually used for training (They already existed in the vocabulary and were not discarded by negative sampling). + """ result = 0 for sentence in sentences: @@ -553,6 +561,7 @@ def score_sg_pair(model, word, word2): ------- float Logarithm of the sum of exponentiations of input words. + """ l1 = model.wv.syn0[word2.index] l2a = deepcopy(model.syn1[word.point]) # 2d matrix, codelen x layer1_size @@ -577,6 +586,7 @@ def score_cbow_pair(model, word, l1): ------- float Logarithm of the sum of exponentiations of input words. + """ l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 @@ -607,14 +617,13 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, Parameters ---------- - sentences : iterable of iterables + sentences : iterable of list of str The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it in some other way. - sg : int {1, 0} Defines the training algorithm. If 1, CBOW is used, otherwise, skip-gram is employed. size : int @@ -710,7 +719,7 @@ def _do_train_job(self, sentences, alpha, inits): Parameters ---------- - sentences : iterable of iterable of str + sentences : iterable of list of str Corpus chunk to be used in this training batch. alpha : float The learning rate used in this batch. 
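
Both scoring helpers documented above reduce to the same per-node hierarchical-softmax arithmetic. A standalone re-implementation of that arithmetic, a minimal sketch with plain `numpy` rather than gensim's internals, makes the formula explicit::

    import numpy as np

    def hs_log_prob(l1, node_vectors, codes):
        # l1: the input (context or word) vector; node_vectors: the rows of syn1
        # along the word's Huffman path; codes: the word's Huffman code, one 0/1
        # bit per inner node of the tree.
        sgn = (-1.0) ** np.asarray(codes)        # maps code 0 -> +1.0, code 1 -> -1.0
        dots = sgn * node_vectors.dot(l1)
        # log sigmoid(x) = -log(1 + exp(-x)), computed stably via logaddexp
        return -np.logaddexp(0, -dots).sum()

    np.random.seed(0)
    l1 = np.random.randn(4)
    print(hs_log_prob(l1, np.random.randn(3, 4), [0, 1, 1]))  # a log-probability, always <= 0

Summing this quantity over every position in a sentence is what `score_sentence_sg` and `score_sentence_cbow` report as the sentence likelihood.
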
@@ -737,6 +746,7 @@ def _clear_post_train(self):
         Notes
         -----
         You will have to recompute them using :meth:`~gensim.models.word2vec.Word2Vec.init_sims`.
+
         """
         self.wv.vectors_norm = None

@@ -772,7 +782,7 @@ def train(self, sentences, total_examples=None, total_words=None,

         Parameters
         ----------
-        sentences : iterable of iterables
+        sentences : iterable of list of str
             The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
             consider an iterable that streams the sentences directly from disk/network.
             See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
@@ -807,6 +817,7 @@ def train(self, sentences, total_examples=None, total_words=None,
        >>> model = Word2Vec(min_count=1)
        >>> model.build_vocab(sentences)
        >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
        (1, 30)
+
    """
@@ -833,7 +844,7 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor

         Parameters
         ----------
-        sentences : iterable of iterables
+        sentences : iterable of list of str
             The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
             consider an iterable that streams the sentences directly from disk/network.
             See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
@@ -957,6 +968,7 @@ def clear_sims(self):
         Notes
         -----
         You will have to recompute them using :meth:`~gensim.models.word2vec.Word2Vec.init_sims`.
+
         """
         self.wv.vectors_norm = None

@@ -1027,6 +1039,7 @@ def __getitem__(self, words):
         """
         Deprecated. Use self.wv.__getitem__() instead.
         Refer to the documentation for `gensim.models.keyedvectors.Word2VecKeyedVectors.__getitem__`
+
         """
         return self.wv.__getitem__(words)

@@ -1035,6 +1048,7 @@ def __contains__(self, word):
         """
         Deprecated. Use self.wv.__contains__() instead.
         Refer to the documentation for `gensim.models.keyedvectors.Word2VecKeyedVectors.__contains__`
+
         """
         return self.wv.__contains__(word)

@@ -1132,7 +1146,7 @@ def log_accuracy(section):

         Parameters
         ----------
-        section : iterable of iterable of str
+        section : iterable of list of str
             Chunk of sentences used to score the model's accuracy.

@@ -1165,6 +1179,7 @@ def delete_temporary_training_data(self, replace_word_vectors_with_normalized=Fa
         ----------
         replace_word_vectors_with_normalized : bool, optional
             If True, forget the original vectors and only keep the normalized ones to save RAM.
+
         """
         if replace_word_vectors_with_normalized:
             self.init_sims(replace=True)

@@ -1234,6 +1249,7 @@ def load(cls, *args, **kwargs):
         -------
         :class:`~gensim.models.word2vec.Word2Vec`
             Loaded model.
+
         """
         try:
             return super(Word2Vec, cls).load(*args, **kwargs)

@@ -1296,8 +1312,7 @@ def __iter__(self):


 class LineSentence(object):
-    """Simple format: one sentence = one line; words already preprocessed and separated by whitespace.
- """ + """Simple format: one sentence = one line; words already preprocessed and separated by whitespace.""" def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): """ From 8badb810a01affebfcea90418f0a175dd89d82c3 Mon Sep 17 00:00:00 2001 From: "Stergiadis, E" Date: Wed, 14 Mar 2018 15:30:57 +0100 Subject: [PATCH 20/41] Applied code review corrections and added top level usage examples --- gensim/models/fasttext.py | 265 ++++++++++++++++++++++++++++---------- 1 file changed, 194 insertions(+), 71 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 6ea76819cf..f3f76ef56b 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -6,7 +6,8 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """Learn word representations via fasttext's "skip-gram and CBOW models", using either -hierarchical softmax or negative sampling [1]_. +hierarchical softmax or negative sampling `Enriching Word Vectors with Subword Information +`_. Notes ----- @@ -16,15 +17,50 @@ This module allows training a word embedding from a training corpus with the additional ability to obtain word vectors for out-of-vocabulary words. -For a tutorial on gensim's native fasttext, refer to the noteboook -- [2]_ +For a tutorial on gensim's native fasttext, refer to the `noteboook +`_. **Make sure you have a C compiler before installing gensim, to use optimized (compiled) fasttext training** -.. [1] P. Bojanowski, E. Grave, A. Joulin, T. Mikolov - Enriching Word Vectors with Subword Information. In arXiv preprint arXiv:1607.04606. - https://arxiv.org/abs/1607.04606 +Examples +-------- -.. [2] https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/FastText_Tutorial.ipynb +#. Initialize a model with e.g.:: + >>> from gensim.test.utils import common_texts + >>> + >>> model = FastText(size=4, window=3, min_count=1) + >>> model.build_vocab(common_texts) + +#. Persist a model to disk with:: + >>> model.save("temp_model.w2v") + >>> model = FastText.load("temp_model.w2v") # you can continue training with the loaded model! + + The word vectors are stored in a KeyedVectors instance in `model.wv`. + This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec:: + + >>> computer_vec = model.wv['computer'] # numpy vector of a word + +#. You can perform various NLP word tasks with the model. Some of them are already built-in:: + + >>> similarities = model.wv.most_similar(positive=['computer', 'human'], negative=['interface']) + >>> most_similar = similarities[0] + + >>> similarities = model.wv.most_similar_cosmul(positive=['computer', 'human'], negative=['interface']) + >>> most_similar = similarities[0] + + >>> not_matching = model.wv.doesnt_match("human computer interface tree".split()) + + >>> sim_score = model.wv.similarity('computer', 'human') + +#. Correlation with human opinion on word similarity:: + + >>> from gensim.test.utils import datapath + >>> + >>> similarities = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv')) + +#. And on analogies:: + + >>> analogies = model.wv.accuracy(datapath('questions-words.txt')) """ @@ -58,24 +94,29 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None): """Update CBOW model by training on a sequence of sentences. Each sentence is a list of string tokens, which are looked up in the model's vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`. 
+ + Notes + ----- This is the non-optimized, Python version. If you have cython installed, gensim will use the optimized version from fasttext_inner instead. + Parameters ---------- model : :class:`~gensim.models.fasttext.FastText` `FastText` instance. - sentences : iterable of iterables + sentences : iterable of list of str Iterable of the sentences directly from disk/network. alpha : float Learning rate. - work : :class:`numpy.ndarray` + work : :class:`numpy.ndarray`, optional Private working memory for each worker. - neu1 : :class:`numpy.ndarray` + neu1 : :class:`numpy.ndarray`, optional Private working memory for each worker. Returns ------- int Effective number of words trained. + """ result = 0 for sentence in sentences: @@ -115,24 +156,30 @@ def train_batch_sg(model, sentences, alpha, work=None, neu1=None): """Update skip-gram model by training on a sequence of sentences. Each sentence is a list of string tokens, which are looked up in the model's vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`. + + Notes + ----- This is the non-optimized, Python version. If you have cython installed, gensim will use the optimized version from fasttext_inner instead. + Parameters ---------- model : :class:`~gensim.models.fasttext.FastText` `FastText` instance. - sentences : iterable of iterables + sentences : iterable of list of str Iterable of the sentences directly from disk/network. alpha : float Learning rate. - work : :class:`numpy.ndarray` + work : :class:`numpy.ndarray`, optional Private working memory for each worker. - neu1 : :class:`numpy.ndarray` + neu1 : :class:`numpy.ndarray`, optional Private working memory for each worker. + Returns ------- int Effective number of words trained. + """ result = 0 for sentence in sentences: @@ -161,7 +208,7 @@ def train_batch_sg(model, sentences, alpha, work=None, neu1=None): class FastText(BaseWordEmbeddingsModel): """Class for training, using and evaluating word representations learned using method - described in [1]_ aka Fasttext. + described in `Enriching Word Vectors with Subword Information `_, aka FastText. The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save()` and :meth:`~gensim.models.fasttext.FastText.load()` methods, or loaded in a format compatible with the original @@ -177,79 +224,83 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, Parameters ---------- - sentences : iterable of iterables - The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, + sentences : iterable of list of str, optional + Can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it in some other way. - sg : int {1, 0} + sg : {1, 0}, optional Defines the training algorithm. If 1, skip-gram is used, otherwise, CBOW is employed. - size : int + size : int, optional Dimensionality of the feature vectors. - window : int + window : int, optional The maximum distance between the current and predicted word within a sentence. - alpha : float + alpha : float, optional The initial learning rate. 
- min_alpha : float + min_alpha : float, optional Learning rate will linearly drop to `min_alpha` as training progresses. - seed : int + seed : int, optional Seed for the random number generator. Initial vectors for each word are seeded with a hash of the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run, you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires use of the `PYTHONHASHSEED` environment variable to control hash randomization). - min_count : int - Ignores all words with total frequency lower than this. - max_vocab_size : int + min_count : int, optional + The model ignores all words with total frequency lower than this. + max_vocab_size : int, optional Limits the RAM during vocabulary building; if there are more unique words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM. Set to `None` for no limit. - sample : float + sample : float, optional The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5). - workers : int + workers : int, optional Use these many worker threads to train the model (=faster training with multicore machines). - hs : int {1,0} + hs : {1,0}, optional If 1, hierarchical softmax will be used for model training. If set to 0, and `negative` is non-zero, negative sampling will be used. - negative : int + negative : int, optional If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn (usually between 5-20). If set to 0, no negative sampling is used. - cbow_mean : int {1,0} + cbow_mean : {1,0}, optional If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. - hashfxn : function + hashfxn : function, optional Hash function to use to randomly initialize weights, for increased training reproducibility. - iter : int + iter : int, optional Number of iterations (epochs) over the corpus. - trim_rule : function + trim_rule : function, optional Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. + The input parameters are of the following types + * word: str. The word we are examining + * count: int. The word's occurence count in the corpus + * min_count: int. The minimum count threshold. Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. - sorted_vocab : int {1,0} - If 1, sort the vocabulary by descending frequency before assigning word indexes. - batch_words : int + sorted_vocab : {1,0}, optional + If 1, sort the vocabulary by descending frequency before assigning word indices. + batch_words : int, optional Target size (in words) for batches of examples passed to worker threads (and thus cython routines).(Larger batches will be passed if individual texts are longer than 10000 words, but the standard cython code truncates to that maximum.) - min_n : int - Min length of char ngrams to be used for training word representations. 
- max_n : int + min_n : int, optional + Minimum length of char n-grams to be used for training word representations. + max_n : int, optional Max length of char ngrams to be used for training word representations. Set `max_n` to be lesser than `min_n` to avoid char ngrams being used. - word_ngrams : int {1,0} - If 1, uses enriches word vectors with subword(ngrams) information. + word_ngrams : {1,0}, optional + If 1, uses enriches word vectors with subword(n-grams) information. If 0, this is equivalent to word2vec. - bucket : int + bucket : int, optional Character ngrams are hashed into a fixed number of buckets, in order to limit the memory usage of the model. This option specifies the number of buckets used by the model. - callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` + callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`, optional List of callbacks that need to be executed/run at specific stages during training. Examples @@ -263,7 +314,6 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, >>> say_vector = model['say'] # get vector for word >>> of_vector = model['of'] # get vector for out-of-vocab word - """ self.load = call_on_class_only self.load_fasttext_format = call_on_class_only @@ -340,28 +390,35 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca Parameters ---------- - sentences : iterable of iterables - The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, + sentences : iterable of list of str + Can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + update : bool + If true, the new words in `sentences` will be added to model's vocab. + progress_per : int + Indicates how many words to process before showing/updating the progress. keep_raw_vocab : bool If not true, delete the raw vocabulary after the scaling is done and free up RAM. - trim_rule : function + trim_rule : function, optional Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. + The input parameters are of the following types + * word: str. The word we are examining + * count: int. The word's occurence count in the corpus + * min_count: int. The minimum count threshold. Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. - progress_per : int - Indicates how many words to process before showing/updating the progress. - update : bool - If true, the new words in `sentences` will be added to model's vocab. + **kwargs + Additional key word parameters passed to + :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.build_vocab`. 
- Example - ------- + Examples + -------- Train a model and update vocab for online training >>> from gensim.models import FastText @@ -392,6 +449,7 @@ def _set_train_params(self, **kwargs): pass def _clear_post_train(self): + """Clears the model's internal structures after training has finished to free up RAM. """ self.wv.vectors_norm = None self.wv.vectors_vocab_norm = None self.wv.vectors_ngrams_norm = None @@ -402,14 +460,14 @@ def _do_train_job(self, sentences, alpha, inits): Parameters ---------- - sentences : iterable of iterables - The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, + sentences : iterable of list of str + Can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. alpha : float The current learning rate. - inits : (:class:`numpy.ndarray`, :class:`numpy.ndarray`) + inits : tuple of (:class:`numpy.ndarray`, :class:`numpy.ndarray`) Each worker's private work memory. Returns @@ -489,20 +547,31 @@ def train(self, sentences, total_examples=None, total_words=None, self.trainables.get_vocab_word_vecs(self.wv) def init_sims(self, replace=False): - """ + """Deletes the keyed vector syn1 structure. + + Notes + ----- init_sims() resides in KeyedVectors because it deals with syn0 mainly, but because syn1 is not an attribute - of KeyedVectors, it has to be deleted in this class, and the normalizing of syn0 happens inside of KeyedVectors + of KeyedVectors, it has to be deleted in this class, and the normalizing of syn0 happens inside of KeyedVectors. + + Parameters + ---------- + replace : bool + If True, forget the original vectors and only keep the normalized ones to save RAM. + """ if replace and hasattr(self.trainables, 'syn1'): del self.trainables.syn1 - return self.wv.init_sims(replace) + self.wv.init_sims(replace) def clear_sims(self): - """ - Removes all L2-normalized vectors for words from the model. + """Removes all L2-normalized vectors for words from the model. + + Notes + ----- You will have to recompute them using init_sims method. - """ + """ self._clear_post_train() @deprecated("Method will be removed in 4.0.0, use self.wv.__getitem__() instead") @@ -523,10 +592,11 @@ def __contains__(self, word): @classmethod def load_fasttext_format(cls, model_file, encoding='utf8'): - """ - Load the input-hidden weight matrix from the fast text output files. + """Load the input-hidden weight matrix from the fast text output files. - Note that due to limitations in the FastText API, you cannot continue training + Notes + ------ + Due to limitations in the FastText API, you cannot continue training with a model loaded this way, though you can query for word similarity etc. Parameters @@ -535,14 +605,14 @@ def load_fasttext_format(cls, model_file, encoding='utf8'): Path to the FastText output files. FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin` Expected value for this example: `/path/to/model` or `/path/to/model.bin`, - as gensim requires only `.bin` file to load entire fastText model. - encoding : str + as gensim requires only `.bin` file to the load entire fastText model. + encoding : str, optional Specifies the encoding. 
Returns ------- - :obj: `~gensim.models.fasttext.FastText` - Returns the loaded model as an instance of :class: `~gensim.models.fasttext.FastText`. + :class: `~gensim.models.fasttext.FastText` + The loaded model. """ model = cls() @@ -553,13 +623,28 @@ def load_fasttext_format(cls, model_file, encoding='utf8'): return model def load_binary_data(self, encoding='utf8'): - """Loads data from the output binary file created by FastText training""" + """Loads data from the output binary file created by FastText training. + + Parameters + ---------- + encoding : str, optional + Specifies the encoding. + + """ with utils.smart_open(self.file_name, 'rb') as f: self._load_model_params(f) self._load_dict(f, encoding=encoding) self._load_vectors(f) def _load_model_params(self, file_handle): + """Loads the models parameters from a file. + + Parameters + ---------- + file_handle : file-like object + Handle to an opened file. + + """ magic, version = self.struct_unpack(file_handle, '@2i') if magic == FASTTEXT_FILEFORMAT_MAGIC: # newer format self.new_format = True @@ -585,6 +670,18 @@ def _load_model_params(self, file_handle): self.vocabulary.sample = t def _load_dict(self, file_handle, encoding='utf8'): + """Loads a previously saved dictionary from disk. + + The dictionary is used to initialize the word vectors. + + Parameters + ---------- + file_handle : file-like object + The opened file handle to the persisted dictionary. + encoding : str + Specifies the encoding. + + """ vocab_size, nwords, nlabels = self.struct_unpack(file_handle, '@3i') # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc) if nlabels > 0: @@ -622,6 +719,16 @@ def _load_dict(self, file_handle, encoding='utf8'): self.struct_unpack(file_handle, '@2i') def _load_vectors(self, file_handle): + """Loads the word vectors from disk. + + Parameters + ---------- + file_handle : file-like object + The opened file handle to the persisted dictionary. + encoding : str + Specifies the encoding. + + """ if self.new_format: self.struct_unpack(file_handle, '@?') # bool quant_input in fasttext.cc num_vectors, dim = self.struct_unpack(file_handle, '@2q') @@ -650,6 +757,21 @@ def _load_vectors(self, file_handle): self._clear_post_train() def struct_unpack(self, file_handle, fmt): + """Get the word vectors from disk using the cc format. + + Parameters + ---------- + file_handle : file_like object + Handle to an open file + fmt : str + Specified the format in which the C representation is saved. + + Returns + ------- + Tuple of (str) + String representation of each byte string found in the C file. + + """ num_bytes = struct.calcsize(fmt) return struct.unpack(fmt, file_handle.read(num_bytes)) @@ -677,8 +799,9 @@ def load(cls, *args, **kwargs): Returns ------- - :obj: `~gensim.models.fasttext.FastText` - Returns the loaded model as an instance of :class: `~gensim.models.fasttext.FastText`. + :class:`~gensim.models.fasttext.FastText` + The loaded model. 
+ """ try: model = super(FastText, cls).load(*args, **kwargs) From 8a8e1fbeec5cc3c35d78e370851bc26e973db56a Mon Sep 17 00:00:00 2001 From: "Stergiadis, E" Date: Wed, 14 Mar 2018 15:31:52 +0100 Subject: [PATCH 21/41] Added high level explanation of the class hierarchy, fixed code review corrections --- gensim/models/base_any2vec.py | 44 +++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 4a3906a9c2..d5175b5d78 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -5,7 +5,21 @@ # Copyright (C) 2018 RaRe Technologies s.r.o. # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Contains base classes required for implementing any2vec algorithms.""" +"""This module contains base classes required for implementing any2vec algorithms. + +The class hierarchy is designed to facilitate adding more concrete implementations for creating embeddings. +In the most general case, the purpose of this class is to transform an arbitrary representation to a numerical vector +(embedding). This is represented by the base :class:`~gensim.models.base_any2vec.BaseAny2VecModel`. The input space in +most cases (in the NLP field at least) is plain text. For this reason, we enrich the class hierarchy with the abstract +:class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` to be used as a base for models where the input +space is text. + +Notes +----- +Even though this is the usual case, not all embeddings transform text. +For example :class:`~gensim.models.poincare.PoincareModel` operates on graph representations. + +""" from gensim import utils import logging from timeit import default_timer @@ -51,15 +65,15 @@ def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_wor Parameters ---------- - workers : int - Number of working threads, used for multiprocessing. - vector_size : int + workers : int, optional + Number of working threads, used for multithreading. + vector_size : int, optional Dimensionality of the feature vectors. - epochs : int + epochs : int, optional Number of iterations (epochs) of training through the corpus. - callbacks : list of :class: `~gensim.models.callbacks.CallbackAny2Vec`, optional + callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional List of callbacks that need to be executed/run at specific stages during training. - batch_words : int + batch_words : int, optional Number of words to be processed by a single job. """ @@ -112,7 +126,7 @@ def _worker_loop(self, job_queue, progress_queue): Parameters ---------- - job_queue : Queue of (list of object, dict) + job_queue : Queue of (list of object, (str, int)) A queue of jobs still to be processed. The worker will take up jobs from this queue. Each job is represented by a tuple where the first element is the corpus chunk to be processed and the second is the dictionary of parameters. @@ -153,8 +167,8 @@ def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=No Parameters ---------- data_iterator : iterable of list of object - The input corpus. This will be split in chunks and these chunks will be pushed to the queue. - job_queue : Queue of (list of object, dict) + The input dataset. This will be split in chunks and these chunks will be pushed to the queue. + job_queue : Queue of (list of object, dict of (str, int)) A queue of jobs still to be processed. The worker will take up jobs from this queue. 
Each job is represented by a tuple where the first element is the corpus chunk to be processed and the second is the dictionary of parameters. @@ -237,7 +251,7 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam * size of data chunk processed, for example number of sentences in the corpus chunk. * Effective word count used in training (after ignoring unknown words and trimming the sentence length). * Total word count used in training. - job_queue : Queue of (list of object, dict) + job_queue : Queue of (list of object, dict of (str, int)) A queue of jobs still to be processed. The worker will take up jobs from this queue. Each job is represented by a tuple where the first element is the corpus chunk to be processed and the second is the dictionary of parameters. @@ -372,7 +386,7 @@ def train(self, data_iterable, epochs=None, total_examples=None, Multiplier for size of queue -> size = number of workers * queue_factor. report_delay : float, optional Number of seconds between two consecutive progress report messages in the logger. - callbacks : list of :class: `~gensim.models.callbacks.CallbackAny2Vec`, optional + callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional List of callbacks that need to be executed/run at specific stages during training. **kwargs Additional key word parameters for the specific model inheriting from this class. @@ -507,7 +521,7 @@ def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbac Dimensionality of the feature vectors. epochs : int, optional Number of iterations (epochs) of training through the corpus. - callbacks : list of :class: `~gensim.models.callbacks.CallbackAny2Vec`, optional + callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional List of callbacks that need to be executed/run at specific stages during training. batch_words : int, optional Number of words to be processed by a single job. @@ -806,7 +820,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No len(raw_vocab), sum(itervalues(raw_vocab)) ) - # Since no sentences are provided, this is to control the `corpus_count` + # Since no sentences are provided, this is to control the corpus_count. self.corpus_count = corpus_count or 0 self.vocabulary.raw_vocab = raw_vocab @@ -879,7 +893,7 @@ def train(self, sentences, total_examples=None, total_words=None, compute_loss : bool, optional If True, loss will be computed while training the Word2Vec model and stored in :attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss`. - callbacks : list of :class: `~gensim.models.callbacks.CallbackAny2Vec`, optional + callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional List of callbacks that need to be executed/run at specific stages during training. 
Returns From 535dc156244b87a1da9fd371fb6a44fb4f26280b Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Mon, 19 Mar 2018 20:40:31 +0100 Subject: [PATCH 22/41] Final identation fixes --- gensim/models/doc2vec.py | 2 +- gensim/models/word2vec.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 4e83c495f3..cb30ef3ef8 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -937,7 +937,7 @@ def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_voca Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The input parameters are of the following types + The input parameters are of the following types: * word: str. The word we are examining * count: int. The word's occurence count in the corpus * min_count: int. The minimum count threshold. diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index e7524a499d..9a764ba113 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -10,7 +10,9 @@ `_, `Distributed Representations of Words and Phrases and their Compositionality `_. -NOTE: There are more ways to get word vectors in Gensim than just Word2Vec. +Notes +----- +There are more ways to get word vectors in Gensim than just Word2Vec. See :class:`~gensim.models.fasttext.FastText` and wrappers for VarEmbed and WordRank. The training algorithms were originally ported from the C package https://code.google.com/p/word2vec/ @@ -44,8 +46,10 @@ The word vectors can also be instantiated from an existing file on disk in the word2vec C format as a KeyedVectors instance. -**NOTE**: It is impossible to continue training the vectors loaded from the C format because hidden weights, -vocabulary frequency and the binary tree is missing:: +Notes +----- +It is impossible to continue training the vectors loaded from the C format because hidden weights, +vocabulary frequency and the binary tree are missing:: >>> from gensim.models import KeyedVectors >>> @@ -99,6 +103,7 @@ Note that there is a :mod:`gensim.models.phrases` module which lets you automatically detect phrases longer than one word. Using phrases, you can learn a word2vec model where "words" are actually multiword expressions, such as `new_york_times` or `financial_crisis`: + >>> from gensim.models import Phrases >>> >>> bigram_transformer = Phrases(common_texts) From 1cc8889876634ba2d76cb15c3b1d764a66ed1461 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Mon, 19 Mar 2018 21:07:15 +0100 Subject: [PATCH 23/41] Documentation fixes --- gensim/models/poincare.py | 142 +++++++++++++++++++++++--------------- 1 file changed, 86 insertions(+), 56 deletions(-) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index cb9bce4aba..0536975b91 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -6,8 +6,11 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Python implementation of Poincaré Embeddings [1]_, an embedding that is better at capturing latent hierarchical -information than traditional Euclidean embeddings. The method is described in more detail in [1]_. +"""Python implementation of Poincaré Embeddings. + +These embeddings are better at capturing latent hierarchical information than traditional Euclidean embeddings. 
+The method is described in detail in `Maximilian Nickel, Douwe Kiela - +"Poincaré Embeddings for Learning Hierarchical Representations" `_. The main use-case is to automatically learn hierarchical representations of nodes from a tree-like structure, such as a Directed Acyclic Graph, using a transitive closure of the relations. Representations of nodes in a @@ -16,8 +19,6 @@ This module allows training a Poincaré Embedding from a training file containing relations of graph in a csv-like format, or a Python iterable of relations. -.. [1] Maximilian Nickel, Douwe Kiela - "Poincaré Embeddings for Learning Hierarchical Representations" - https://arxiv.org/abs/1705.08039 Examples -------- @@ -73,7 +74,9 @@ class PoincareModel(utils.SaveLoad): and :meth:`~gensim.models.poincare.PoincareModel.load` methods, or stored/loaded in the word2vec format via `model.kv.save_word2vec_format` and :meth:`~gensim.models.poincare.PoincareKeyedVectors.load_word2vec_format`. - Note that training cannot be resumed from a model loaded via `load_word2vec_format`, if you wish to train further, + Notes + ----- + Training cannot be resumed from a model loaded via `load_word2vec_format`, if you wish to train further, use :meth:`~gensim.models.poincare.PoincareModel.save` and :meth:`~gensim.models.poincare.PoincareModel.load` methods instead. @@ -84,11 +87,11 @@ def __init__(self, train_data, size=50, alpha=0.1, negative=10, workers=1, epsil Parameters ---------- - train_data : iterable of (str, str) - Iterable of relations, e.g. a list of tuples, or a PoincareRelations instance streaming from a file. - Note that the relations are treated as ordered pairs, i.e. a relation (a, b) does not imply the - opposite relation (b, a). In case the relations are symmetric, the data should contain both relations - (a, b) and (b, a). + train_data : {iterable of (str, str), :class:`gensim.models.poincare.PoincareRelations` + Iterable of relations, e.g. a list of tuples, or a :class:`gensim.models.poincare.PoincareRelations` + instance streaming from a file. Note that the relations are treated as ordered pairs, + i.e. a relation (a, b) does not imply the opposite relation (b, a). In case the relations are symmetric, + the data should contain both relations (a, b) and (b, a). size : int, optional Number of dimensions of the trained model. alpha : float, optional @@ -190,6 +193,7 @@ def _init_embeddings(self): self.kv.syn0 = self._np_random.uniform(self.init_range[0], self.init_range[1], shape).astype(self.dtype) def _init_node_probabilities(self): + """Initialize the a-priori probabilities. """ counts = np.array([ self.kv.vocab[self.kv.index2word[i]].count for i in range(len(self.kv.index2word)) @@ -271,7 +275,7 @@ def _loss_fn(matrix, regularization_coeff=1.0): ---------- matrix : numpy.array Array containing vectors for u, v and negative samples, of shape (2 + negative_size, dim). - regularization_coeff : float + regularization_coeff : float, optional Coefficient to use for l2-regularization Returns @@ -305,7 +309,7 @@ def _clip_vectors(vectors, epsilon): Parameters ---------- vectors : numpy.array - Can be 1-D,or 2-D (in which case the norm for each row is checked). + Can be 1-D, or 2-D (in which case the norm for each row is checked). epsilon : float Parameter for numerical stability, each dimension of the vector is reduced by `epsilon` if the norm of the vector is greater than or equal to 1. 
@@ -334,7 +338,18 @@ def _clip_vectors(vectors, epsilon): return vectors def save(self, *args, **kwargs): - """Save complete model to disk, inherited from :class:`gensim.utils.SaveLoad`.""" + """Save complete model to disk, inherited from :class:`~gensim.utils.SaveLoad`. + + See also :meth:`~gensim.models.poincare.PoincareModel.load` + + Parameters + ---------- + *args + Positional arguments passed to :meth:`~gensim.utils.SaveLoad.save`. + **kwargs + Keyword arguments passed to :meth:`~gensim.utils.SaveLoad.save`. + + """ self._loss_grad = None # Can't pickle autograd fn to disk attrs_to_ignore = ['_node_probabilities', '_node_counts_cumsum'] kwargs['ignore'] = set(list(kwargs.get('ignore', [])) + attrs_to_ignore) @@ -342,7 +357,23 @@ def save(self, *args, **kwargs): @classmethod def load(cls, *args, **kwargs): - """Load model from disk, inherited from :class:`~gensim.utils.SaveLoad`.""" + """Load model from disk, inherited from :class:`~gensim.utils.SaveLoad`. + + See also :meth:`~gensim.models.poincare.PoincareModel.save` + + Parameters + ---------- + *args + Positional arguments passed to :meth:`~gensim.utils.SaveLoad.load`. + **kwargs + Keyword arguments passed to :meth:`~gensim.utils.SaveLoad.load`. + + Returns + ------- + :class:`~gensim.models.poincare.PoincareModel` + The loaded model. + + """ model = super(PoincareModel, cls).load(*args, **kwargs) model._init_node_probabilities() return model @@ -352,7 +383,6 @@ def _prepare_training_batch(self, relations, all_negatives, check_gradients=Fals Parameters ---------- - relations : list of tuples List of tuples of positive examples of the form (node_1_index, node_2_index). all_negatives : list of lists @@ -390,12 +420,14 @@ def _check_gradients(self, relations, all_negatives, batch, tol=1e-8): Parameters ---------- - batch : PoincareBatch instance - Batch for which computed gradients are to checked. relations : list of tuples List of tuples of positive examples of the form (node_1_index, node_2_index). all_negatives : list of lists List of lists of negative samples for each node_1 in the positive examples. + batch : :class:`~gensim.models.poincare.PoincareBatch` + Batch for which computed gradients are to be checked. + tol : float, optional + The maximum error between our computed gradients and the reference ones from autograd. """ if not AUTOGRAD_PRESENT: @@ -425,7 +457,7 @@ def _sample_negatives_batch(self, nodes): Parameters ---------- - nodes : list + nodes : list of int List of node indices for which negative samples are to be returned. Returns @@ -442,7 +474,7 @@ def _train_on_batch(self, relations, check_gradients=False): Parameters ---------- - relations : list of tuples + relations : list of tuples of (int, int) List of tuples of positive examples of the form (node_1_index, node_2_index). check_gradients : bool, optional Whether to compare the computed gradients to autograd gradients for this batch. @@ -466,7 +498,7 @@ def _handle_duplicates(vector_updates, node_indices): ---------- vector_updates : numpy.array Array with each row containing updates to be performed on a certain node. - node_indices : list + node_indices : list of int Node indices on which the above updates are to be performed on. Notes @@ -518,11 +550,11 @@ def train(self, epochs, batch_size=10, print_every=1000, check_gradients_every=N Parameters ---------- - - batch_size : int, optional - Number of examples to train on in a single batch. epochs : int Number of iterations (epochs) over the corpus. 
+ batch_size : int, optional + Number of examples to train on in a single batch. + print_every : int, optional Prints progress and average loss after every `print_every` batches. check_gradients_every : int or None, optional @@ -625,18 +657,16 @@ def __init__(self, vectors_u, vectors_v, indices_u, indices_v, regularization_co Parameters ---------- vectors_u : numpy.array - Vectors of all nodes `u` in the batch. - Expected shape (batch_size, dim). + Vectors of all nodes `u` in the batch. Expected shape (batch_size, dim). vectors_v : numpy.array Vectors of all positively related nodes `v` and negatively sampled nodes `v'`, - for each node `u` in the batch. - Expected shape (1 + neg_size, dim, batch_size). - indices_u : list + for each node `u` in the batch. Expected shape (1 + neg_size, dim, batch_size). + indices_u : list of int List of node indices for each of the vectors in `vectors_u`. - indices_v : list + indices_v : list of lists of int Nested list of lists, each of which is a list of node indices for each of the vectors in `vectors_v` for a specific node `u`. - regularization_coeff : float + regularization_coeff : float, optional Coefficient to use for l2-regularization """ @@ -1153,10 +1183,10 @@ def distances(self, node_or_vector, other_nodes=()): Parameters ---------- - node_or_vector : str/int or numpy.array + node_or_vector : {str, int, numpy.array} Node key or vector from which distances are to be computed. - other_nodes : iterable of str/int or None + other_nodes : {iterable of str, iterable of int, None}, optional For each node in `other_nodes` distance from `node_or_vector` is computed. If None or empty, distance of `node_or_vector` from all nodes in vocab is computed (including itself). @@ -1198,7 +1228,7 @@ def norm(self, node_or_vector): Parameters ---------- - node_or_vector : str/int or numpy.array + node_or_vector : {str, int, numpy.array} Input node key or vector for which position in hierarchy is to be returned. Returns @@ -1230,10 +1260,10 @@ def difference_in_hierarchy(self, node_or_vector_1, node_or_vector_2): Parameters ---------- - node_or_vector_1 : str/int or numpy.array + node_or_vector_1 : {str, int, numpy.array} Input node key or vector. - node_or_vector_2 : str/int or numpy.array + node_or_vector_2 : {str, int, numpy.array} Input node key or vector. Returns @@ -1364,7 +1394,7 @@ def __init__(self, file_path, embedding): ---------- file_path : str Path to tsv file containing relation pairs. - embedding : PoincareKeyedVectors instance + embedding : :class:`~gensim.models.poincare.PoincareKeyedVectors` Embedding to be evaluated. """ @@ -1391,15 +1421,15 @@ def get_positive_relation_ranks_and_avg_prec(all_distances, positive_relations): Parameters ---------- - all_distances : numpy.array (float) + all_distances : numpy.array of float Array of all distances (floats) for a specific item. positive_relations : list List of indices of positive relations for the item. Returns ------- - tuple (list, float) - The list contains ranks (int) of positive relations in the same order as `positive_relations`. + tuple (list of int, float) + The list contains ranks of positive relations in the same order as `positive_relations`. The float is the Average Precision of the ranking. e.g. ([1, 2, 3, 20], 0.610). @@ -1418,12 +1448,12 @@ def evaluate(self, max_n=None): Parameters ---------- - max_n : int or None + max_n : int, optional Maximum number of positive relations to evaluate, all if `max_n` is None. 
Returns ------- - dict + dict of (str, float) Contains (metric_name, metric_value) pairs. e.g. {'mean_rank': 50.3, 'MAP': 0.31}. @@ -1436,12 +1466,12 @@ def evaluate_mean_rank_and_map(self, max_n=None): Parameters ---------- - max_n : int or None + max_n : int, optional Maximum number of positive relations to evaluate, all if `max_n` is None. Returns ------- - tuple (float, float) + tuple of (float, float) Contains (mean_rank, MAP). e.g (50.3, 0.31) @@ -1475,7 +1505,7 @@ def __init__(self, train_path, test_path, embedding): Path to tsv file containing relation pairs used for training. test_path : str Path to tsv file containing relation pairs to evaluate. - embedding : PoincareKeyedVectors instance + embedding : :class:`~gensim.models.poincare.PoincareKeyedVectors` Embedding to be evaluated. """ @@ -1504,17 +1534,17 @@ def get_unknown_relation_ranks_and_avg_prec(all_distances, unknown_relations, kn Parameters ---------- - all_distances : numpy.array (float) + all_distances : numpy.array of float Array of all distances for a specific item. - unknown_relations : list + unknown_relations : list of int List of indices of unknown positive relations. - known_relations : list + known_relations : list of int List of indices of known positive relations. Returns ------- - tuple (list, float) - The list contains ranks (int) of positive relations in the same order as `positive_relations`. + tuple (list of int, float) + The list contains ranks of positive relations in the same order as `positive_relations`. The float is the Average Precision of the ranking. e.g. ([1, 2, 3, 20], 0.610). @@ -1534,12 +1564,12 @@ def evaluate(self, max_n=None): Parameters ---------- - max_n : int or None + max_n : int, optional Maximum number of positive relations to evaluate, all if `max_n` is None. Returns ------- - dict + dict of (str, float) Contains (metric_name, metric_value) pairs. e.g. {'mean_rank': 50.3, 'MAP': 0.31}. @@ -1552,7 +1582,7 @@ def evaluate_mean_rank_and_map(self, max_n=None): Parameters ---------- - max_n : int or None + max_n : int, optional Maximum number of positive relations to evaluate, all if `max_n` is None. Returns @@ -1608,7 +1638,7 @@ def score_function(self, embedding, trie, term_1, term_2): Parameters ---------- - embedding : PoincareKeyedVectors instance + embedding : :class:`~gensim.models.poincare.PoincareKeyedVectors` Embedding to use for computing predicted score. trie : pygtrie.Trie instance Trie to use for finding matching vocab terms for input terms. @@ -1655,7 +1685,7 @@ def find_matching_terms(trie, word): Returns ------- - list (str) + list of str List of matching terms. """ @@ -1669,7 +1699,7 @@ def create_vocab_trie(embedding): Parameters ---------- - embedding : PoincareKeyedVectors instance + embedding : :class:`~gensim.models.poincare.PoincareKeyedVectors` Embedding for which trie is to be created. Returns @@ -1694,7 +1724,7 @@ def evaluate_spearman(self, embedding): Parameters ---------- - embedding : PoincareKeyedVectors instance + embedding : :class:`~gensim.models.poincare.PoincareKeyedVectors` Embedding for which evaluation is to be done. 
Returns From add686ea9184150ba5ac8cf5fb4d89e48104034b Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Mon, 19 Mar 2018 21:49:11 +0100 Subject: [PATCH 24/41] Fixed all examples --- gensim/models/poincare.py | 109 ++++++++++++++++++++++++++++---------- 1 file changed, 82 insertions(+), 27 deletions(-) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index 0536975b91..72bd3cdaaa 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -833,9 +833,15 @@ def word_vec(self, word): Returns the word's representations in vector space, as a 1D numpy array. Example:: - - >>> trained_model.word_vec('office') - array([ -1.40128313e-02, ...]) + >>> from gensim.test.utils import datapath + >>> + >>> # Read the sample relations file and train the model + >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv')) + >>> model = PoincareModel(train_data=relations) + >>> model.train(epochs=50) + >>> + >>> # Query the trained model. + >>> wv = model.kv.word_vec('kangaroo.n.01') """ return super(PoincareKeyedVectors, self).get_vector(word) @@ -858,9 +864,16 @@ def words_closer_than(self, w1, w2): Examples -------- - - >>> model.words_closer_than('carnivore.n.01', 'mammal.n.01') - ['dog.n.01', 'canine.n.02'] + >>> from gensim.test.utils import datapath + >>> + >>> # Read the sample relations file and train the model + >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv')) + >>> model = PoincareModel(train_data=relations) + >>> model.train(epochs=50) + >>> + >>> # Which term is closer to 'kangaroo' than 'metatherian' is to 'kangaroo'? + >>> model.kv.words_closer_than('kangaroo.n.01', 'metatherian.n.01') + [u'marsupial.n.01', u'phalanger.n.01'] """ return super(PoincareKeyedVectors, self).closer_than(w1, w2) @@ -1084,9 +1097,16 @@ def distance(self, w1, w2): Examples -------- - - >>> model.distance('mammal.n.01', 'carnivore.n.01') - 2.13 + >>> from gensim.test.utils import datapath + >>> + >>> # Read the sample relations file and train the model + >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv')) + >>> model = PoincareModel(train_data=relations) + >>> model.train(epochs=50) + >>> + >>> # What is the distance between the words 'mammal' and 'carnivore'? + >>> model.kv.distance('mammal.n.01', 'carnivore.n.01') + 2.9742298803339304 Notes ----- @@ -1115,9 +1135,16 @@ def similarity(self, w1, w2): Examples -------- - - >>> model.similarity('mammal.n.01', 'carnivore.n.01') - 0.73 + >>> from gensim.test.utils import datapath + >>> + >>> # Read the sample relations file and train the model + >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv')) + >>> model = PoincareModel(train_data=relations) + >>> model.train(epochs=50) + >>> + >>> # What is the similarity between the words 'mammal' and 'carnivore'? + >>> model.kv.similarity('mammal.n.01', 'carnivore.n.01') + 0.25162107631176484 Notes ----- @@ -1150,8 +1177,16 @@ def most_similar(self, node_or_vector, topn=10, restrict_vocab=None): Examples -------- - >>> vectors.most_similar('lion.n.01') - [('lion_cub.n.01', 0.4484), ('lionet.n.01', 0.6552), ...] + >>> from gensim.test.utils import datapath + >>> + >>> # Read the sample relations file and train the model + >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv')) + >>> model = PoincareModel(train_data=relations) + >>> model.train(epochs=50) + >>> + >>> # Which words are most similar to 'kangaroo'? 
+ >>> model.kv.most_similar('kangaroo.n.01', topn=2) + [(u'kangaroo.n.01', 0.0), (u'marsupial.n.01', 0.26524229460827725)] """ if not restrict_vocab: @@ -1198,12 +1233,19 @@ def distances(self, node_or_vector, other_nodes=()): Examples -------- + >>> from gensim.test.utils import datapath + >>> + >>> # Read the sample relations file and train the model + >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv')) + >>> model = PoincareModel(train_data=relations) + >>> model.train(epochs=50) + >>> + >>> # Check the distances between a word and a list of other words. + >>> model.kv.distances('mammal.n.01', ['carnivore.n.01', 'dog.n.01']) + array([2.97422988, 2.83007402]) - >>> model.distances('mammal.n.01', ['carnivore.n.01', 'dog.n.01']) - np.array([2.1199, 2.0710] - - >>> model.distances('mammal.n.01') - np.array([0.43753847, 3.67973852, ..., 6.66172886]) + >>> # Check the distances between a word and every other word in the vocab. + >>> all_distances = model.kv.distances('mammal.n.01') Notes ----- @@ -1238,9 +1280,16 @@ def norm(self, node_or_vector): Examples -------- - - >>> model.norm('mammal.n.01') - 0.9 + >>> from gensim.test.utils import datapath + >>> + >>> # Read the sample relations file and train the model + >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv')) + >>> model = PoincareModel(train_data=relations) + >>> model.train(epochs=50) + >>> + >>> # Get the norm of the embedding of the word `mammal`. + >>> model.kv.norm('mammal.n.01') + 0.6423008703542398 Notes ----- @@ -1273,12 +1322,18 @@ def difference_in_hierarchy(self, node_or_vector_1, node_or_vector_2): Examples -------- + >>> from gensim.test.utils import datapath + >>> + >>> # Read the sample relations file and train the model + >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv')) + >>> model = PoincareModel(train_data=relations) + >>> model.train(epochs=50) + >>> + >>> model.kv.difference_in_hierarchy('mammal.n.01', 'dog.n.01') + 0.05382517902410999 - >>> model.difference_in_hierarchy('mammal.n.01', 'dog.n.01') - 0.51 - - >>> model.difference_in_hierarchy('dog.n.01', 'mammal.n.01') - -0.51 + >>> model.kv.difference_in_hierarchy('dog.n.01', 'mammal.n.01') + -0.05382517902410999 Notes ----- From 7cb408c32c445af39e4884a73bf08b460c54b017 Mon Sep 17 00:00:00 2001 From: "Stergiadis, E" Date: Tue, 20 Mar 2018 11:44:18 +0100 Subject: [PATCH 25/41] delete redundant reference to module --- gensim/models/doc2vec.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index cb30ef3ef8..c582c44b71 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -924,7 +924,6 @@ def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_voca Can be simply a list of :class:`~gensim.models.doc2vec.TaggedDocument` elements, but for larger corpora, consider an iterable that streams the documents directly from disk/network. See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument` - in :mod:`~gensim.models.doc2vec` module for such examples. update : bool If true, the new words in `sentences` will be added to model's vocab. 
progress_per : int @@ -938,9 +937,9 @@ def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_voca or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. The input parameters are of the following types: - * word: str. The word we are examining - * count: int. The word's occurence count in the corpus - * min_count: int. The minimum count threshold. + * word: str. The word we are examining. + * count: int. The word's occurence count in the corpus. + * min_count: int. The minimum count threshold. Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. **kwargs From 5b6d8154c738b8c7bb50626f58aeb42cfa08e4e0 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Wed, 21 Mar 2018 20:45:34 +0100 Subject: [PATCH 26/41] Added explanation for all important class attributes. These include some intuitive information taken from the papers but also references to usage examples for users that do not wish to understand the underlying theory. --- gensim/models/doc2vec.py | 35 ++++++++++++++++++++++++++++++++++- gensim/models/fasttext.py | 24 ++++++++++++++++++++++++ gensim/models/poincare.py | 8 ++++++++ gensim/models/word2vec.py | 18 ++++++++++++++++++ 4 files changed, 84 insertions(+), 1 deletion(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index c582c44b71..fa10665ea7 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -385,7 +385,40 @@ def repeat(self, word_count): class Doc2Vec(BaseWordEmbeddingsModel): - """Class for training, using and evaluating neural networks described in http://arxiv.org/pdf/1405.4053v2.pdf""" + """Class for training, using and evaluating neural networks described in + `Distributed Representations of Sentences and Documents `_. + + Some important attributes are the following: + + self.wv : :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` + This object essentially contains the mapping between words and embeddings. After training, it can be used + directly to query those embeddings in various ways. See the module level docstring for examples. + + self.docvecs : :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors` + This object contains the paragraph vectors. Remember that the only difference between this model and + Word2Vec is that besides the word vectors we also include paragraph embeddings to capture the paragraph. + In this way we can capture the difference between the same word used in a different wide context. + For example we now have a different representation of the word "leaves" in the following two sentences:: + + 1. Manos leaves the office every day at 18:00 to catch his train + 2. This season is called Fall, because leaves fall from the trees. + + In a plain Word2Vec model the word would have exactly the same representation in both sentences, in Doc2Vec it + will not. + + self.vocabulary : :class:'~gensim.models.doc2vec.Doc2VecVocab' + This object represents the vocabulary (sometimes called Dictionary in gensim) of the model. + Besides keeping track of all unique words, this object provides extra functionality, such as + sorting words by frequency, or discarding extremely rare words. + + self.trainables : :class:`~gensim.models.doc2vec.Doc2VecTrainables` + This object represents the inner shallow neural network used to train the embeddings. 
The semantics of the + network differ slightly in the two available training modes (CBOW or SG) but you can think of it as a NN with + a single projection and hidden layer which we train on the corpus. The weights are then used as our embeddings + The only addition to the underlying NN used in Word2Vec is that the input includes not only the word vectors + of each word in the context, but also the paragraph vector. + + """ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(), **kwargs): diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 086f25ce27..949e94640a 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -81,6 +81,7 @@ logger = logging.getLogger(__name__) try: + raise ImportError from gensim.models.fasttext_inner import train_batch_sg, train_batch_cbow from gensim.models.fasttext_inner import FAST_VERSION, MAX_WORDS_IN_BATCH @@ -206,6 +207,29 @@ class FastText(BaseWordEmbeddingsModel): :meth:`~gensim.models.fasttext.FastText.load()` methods, or loaded in a format compatible with the original fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format()`. + Some important attributes are the following: + + self.wv : :class:`~gensim.models.keyedvectors.FastTextKeyedVectors` + This object essentially contains the mapping between words and embeddings. These are similar to the embeddings + computed in the Word2Vec model, however here we also include vectors for n-grams. This allows the model to + compute embeddings even for **unseen** words (that do not exist in the vocabulary), as the aggregate of the + n-grams included in the word. After training the model, this attribute can be used directly to query those + embeddings in various ways. Check the module level docstring from some examples. + + self.vocabulary : :class:'~gensim.models.fasttext.FastTextVocab' + This object represents the vocabulary (sometimes called Dictionary in gensim) of the model. + Besides keeping track of all unique words, this object provides extra functionality, such as + constructing a huffman tree (frequent words are closer to the root), or discarding extremely rare words. + + self.trainables : :class:`~gensim.models.fasttext.FastTextTrainables` + This object represents the inner shallow neural network used to train the embeddings. This is very + similar to the network of the Word2Vec model, but it also trains weights for the N-Grams (sequences of more + than 1 words). The semantics of the network are almost the same as the one used for the Word2Vec model: + You can think of it as a NN with a single projection and hidden layer which we train on the corpus. + The weights are then used as our embeddings. An important difference however between the two models, is the + scoring function used to compute the loss. In the case of FastText, this is modified in word to also account + for the internal structure of words, besides their cooccurence counts. 
+ """ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index 72bd3cdaaa..9353226d3e 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -80,6 +80,14 @@ class PoincareModel(utils.SaveLoad): use :meth:`~gensim.models.poincare.PoincareModel.save` and :meth:`~gensim.models.poincare.PoincareModel.load` methods instead. + An important attribute (that provides a lot of additional functionality when directly accessed) are the \ + keyed vectors: + + self.kv : :class:`~gensim.models.poincare.PoincareKeyedVectors` + This object essentially contains the mapping between nodes and embeddings, as well the vocabulary of the model + (set of unique nodes seen by the model). After training, it can be used to perform operations on the vectors \ + such as vector lookup, distance etc. See the documentation of its class for many usage examples. + """ def __init__(self, train_data, size=50, alpha=0.1, negative=10, workers=1, epsilon=1e-5, regularization_coeff=1.0, burn_in=10, burn_in_alpha=0.01, init_range=(-0.001, 0.001), dtype=np.float64, seed=0): diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 9a764ba113..c574a5a511 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -610,6 +610,24 @@ class Word2Vec(BaseWordEmbeddingsModel): compatible with the original word2vec implementation via `wv.save_word2vec_format()` and `Word2VecKeyedVectors.load_word2vec_format()`. + + Some important attributes are the following: + + self.wv : :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` + This object essentially contains the mapping between words and embeddings. After training, it can be used + directly to query those embeddings in various ways. See the module level docstring for examples. + + self.vocabulary : :class:'~gensim.models.word2vec.Word2VecVocab' + This object represents the vocabulary (sometimes called Dictionary in gensim) of the model. + Besides keeping track of all unique words, this object provides extra functionality, such as + constructing a huffman tree (frequent words are closer to the root), or discarding extremely rare words. + + self.trainables : :class:`~gensim.models.word2vec.Word2VecTrainables` + This object represents the inner shallow neural network used to train the embeddings. The semantics of the + network differ slightly in the two available training modes (CBOW or SG) but you can think of it as a NN with + a single projection and hidden layer which we train on the corpus. The weights are then used as our embeddings + (which means that the size of the hidden layer is equal to the number of features `self.size`). 
+ """ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, From f58e9a2b471e91472ff64e62fd517a23f0f11224 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Thu, 29 Mar 2018 11:17:56 +0200 Subject: [PATCH 27/41] documented public cython functions --- gensim/models/word2vec_inner.pyx | 102 ++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 1 deletion(-) diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index 98e719c6d4..6974c3a96a 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -294,6 +294,30 @@ cdef unsigned long long fast_sentence_cbow_neg( def train_batch_sg(model, sentences, alpha, _work, compute_loss): + """Update skip-gram model by training on a batch of sentences. + + Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`. + + Parameters + ---------- + model : :class:`~gensim.models.word2Vec.Word2Vec` + The Word2Vec model instance to train. + sentences : iterable of list of str + The corpus used to train the model. + alpha : float + The learning rate + _work : np.ndarray + Private working memory for each worker. + compute_loss : bool + Whether or not the training loss should be computed in this batch. + + Returns + ------- + int + Number of words in the vocabulary actually used for training (They already existed in the vocabulary + and were not discarded by negative sampling). + + """ cdef int hs = model.hs cdef int negative = model.negative cdef int sample = (model.vocabulary.sample != 0) @@ -401,6 +425,31 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss): def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss): + """Update CBOW model by training on a batch of sentences. + + Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`. + + Parameters + ---------- + model : :class:`~gensim.models.word2vec.Word2Vec` + The Word2Vec model instance to train. + sentences : iterable of list of str + The corpus used to train the model. + alpha : float + The learning rate. + _work : np.ndarray + Private working memory for each worker. + _neu1 : np.ndarray + Private working memory for each worker. + compute_loss : bool + Whether or not the training loss should be computed in this batch. + + Returns + ------- + int + Number of words in the vocabulary actually used for training (They already existed in the vocabulary + and were not discarded by negative sampling). + """ cdef int hs = model.hs cdef int negative = model.negative cdef int sample = (model.vocabulary.sample != 0) @@ -506,8 +555,29 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss): return effective_words -# Score is only implemented for hierarchical softmax def score_sentence_sg(model, sentence, _work): + """Obtain likelihood score for a single sentence in a fitted skip-gram representation. + + Notes + ----- + This scoring function is only implemented for hierarchical softmax (`model.hs == 1`). + The model should have been trained using the skip-gram model (`model.sg` == 1`). + + Parameters + ---------- + model : :class:`~gensim.models.word2vec.Word2Vec` + The trained model. It **MUST** have been trained using hierarchical softmax and the skip-gram algorithm. + sentence : list of str + The words comprising the sentence to be scored. + _work : np.ndarray + Private working memory for each worker. + + Returns + ------- + float + The probability assigned to this sentence by the Skip-Gram model. 
+ + """ cdef REAL_t *syn0 = (np.PyArray_DATA(model.wv.vectors)) cdef REAL_t *work @@ -586,7 +656,30 @@ cdef void score_pair_sg_hs( work[0] += f def score_sentence_cbow(model, sentence, _work, _neu1): + """Obtain likelihood score for a single sentence in a fitted CBOW representation. + + Notes + ----- + This scoring function is only implemented for hierarchical softmax (`model.hs == 1`). + The model should have been trained using the skip-gram model (`model.cbow` == 1`). + + Parameters + ---------- + model : :class:`~gensim.models.word2vec.Word2Vec` + The trained model. It **MUST** have been trained using hierarchical softmax and the CBOW algorithm. + sentence : list of str + The words comprising the sentence to be scored. + _work : np.ndarray + Private working memory for each worker. + _neu1 : np.ndarray + Private working memory for each worker. + + Returns + ------- + float + The probability assigned to this sentence by the Skip-Gram model. + """ cdef int cbow_mean = model.cbow_mean cdef REAL_t *syn0 = (np.PyArray_DATA(model.wv.vectors)) @@ -685,6 +778,13 @@ def init(): Precompute function `sigmoid(x) = 1 / (1 + exp(-x))`, for x values discretized into table EXP_TABLE. Also calculate log(sigmoid(x)) into LOG_TABLE. + Returns + ------- + {0, 1, 2} + Enumeration to signify underlying data type returned by the BLAS dot product calculation. + 0 signifies double, 1 signifies double, and 2 signifies that custom cython loops were used + instead of BLAS. + """ global our_dot global our_saxpy From 6570cefeb9e91098f0f9e1bca64abcac2049c640 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Thu, 29 Mar 2018 11:48:11 +0200 Subject: [PATCH 28/41] documented public cython functions in doc2vec --- gensim/models/doc2vec_inner.pyx | 134 ++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/gensim/models/doc2vec_inner.pyx b/gensim/models/doc2vec_inner.pyx index b41e8a8a3a..3facbdde44 100644 --- a/gensim/models/doc2vec_inner.pyx +++ b/gensim/models/doc2vec_inner.pyx @@ -227,6 +227,51 @@ cdef unsigned long long fast_document_dmc_neg( def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): + ""Update distributed bag of words model ("PV-DBOW") by training on a single document. + + Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and + :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector()`. + + Parameters + ---------- + model : :class:`~gensim.models.doc2vec.Doc2Vec` + The model to train. + doc_words : list of str + The input document as a list of words to be used for training. Each word will be looked up in + the model's vocabulary. + doctag_indexes : list of int + Indices into `doctag_vectors` used to obtain the tags of the document. + alpha : float + Learning rate. + work : list of float, optional + Updates to be performed on each neuron in the hidden layer of the underlying network. + train_words : bool, optional + Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both** + `learn_words` and `train_words` are set to True. + learn_doctags : bool, optional + Whether the tag vectors should be updated. + learn_words : bool, optional + Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both** + `learn_words` and `train_words` are set to True. 
+ learn_hidden : bool, optional + Whether or not the weights of the hidden layer will be updated. + word_vectors : list of list of float, optional + The vector representation for each word in the vocabulary. If None, these will be retrieved from + the model. + word_locks : list of float, optional + A learning lock factor for each weight in the hidden layer. A value of 0 completely + blocks updates, a value of 1 allows full speed learning. + doctag_vectors : list of list of float, optional + Vector representations of the tags. If None, these will be retrieved from the model. + doctag_locks : list of float, optional + The lock factors for each tag. + + Returns + ------- + int + Number of words in the input document. + + """ cdef int hs = model.hs cdef int negative = model.negative cdef int sample = (model.vocabulary.sample != 0) @@ -363,6 +408,51 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, learn_doctags=True, learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): + """Update distributed memory model ("PV-DM") by training on a single document. + + Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and + :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector()`. This method implements + the DM model with a projection (input) layer that is either the sum or mean of + the context vectors, depending on the model's `dm_mean` configuration field. + + Parameters + ---------- + model : :class:`~gensim.models.doc2vec.Doc2Vec` + The model to train. + doc_words : list of str + The input document as a list of words to be used for training. Each word will be looked up in + the model's vocabulary. + doctag_indexes : list of int + Indices into `doctag_vectors` used to obtain the tags of the document. + alpha : float + Learning rate. + work : np.ndarray, optional + Private working memory for each worker. + neu1 : np.ndarray, optional + Private working memory for each worker. + learn_doctags : bool, optional + Whether the tag vectors should be updated. + learn_words : bool, optional + Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both** + `learn_words` and `train_words` are set to True. + learn_hidden : bool, optional + Whether or not the weights of the hidden layer will be updated. + word_vectors : iterable of list of float + Vector representations of each word in the model's vocabulary. + word_locks : listf of float, optional + Lock factors for each word in the vocabulary. 0 blocks training, 1 fully allows it. + doctag_vectors : list of list of float, optional + Vector representations of the tags. If None, these will be retrieved from the model. + doctag_locks : list of float, optional + The lock factors for each tag. 0 blocks training, 1 fully allows it. + + Returns + ------- + int + Number of words in the input document that were actually used for training (they were found in the + vocavulary and they were not discarded by negative sampling). 
+ + """ cdef int hs = model.hs cdef int negative = model.negative cdef int sample = (model.vocabulary.sample != 0) @@ -521,6 +611,50 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, learn_doctags=True, learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): + """Update distributed memory model ("PV-DM") by training on a single document, using a + concatenation of the context window word vectors (rather than a sum or average). This + might be slower since the input at each batch will be significantly larger. + + Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. + + Parameters + ---------- + model : :class:`~gensim.models.doc2vec.Doc2Vec` + The model to train. + doc_words : list of str + The input document as a list of words to be used for training. Each word will be looked up in + the model's vocabulary. + doctag_indexes : list of int + Indices into `doctag_vectors` used to obtain the tags of the document. + alpha : float, optional + Learning rate. + work : np.ndarray, optional + Private working memory for each worker. + neu1 : np.ndarray, optional + Private working memory for each worker. + learn_doctags : bool, optional + Whether the tag vectors should be updated. + learn_words : bool, optional + Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both** + `learn_words` and `train_words` are set to True. + learn_hidden : bool, optional + Whether or not the weights of the hidden layer will be updated. + word_vectors : iterable of list of float, optional + Vector representations of each word in the model's vocabulary. + word_locks : listf of float, optional + Lock factors for each word in the vocabulary. + doctag_vectors : list of list of float, optional + Vector representations of the tags. If None, these will be retrieved from the model. + doctag_locks : list of float, optional + The lock factors for each tag. + + Returns + ------- + int + Number of words in the input document that were actually used for training (they were found in the + vocavulary and they were not discarded by negative sampling). + + """ cdef int hs = model.hs cdef int negative = model.negative cdef int sample = (model.vocabulary.sample != 0) From 0e8d2995a7a341f25553628821b58e98a52294cc Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Fri, 30 Mar 2018 16:39:09 +0200 Subject: [PATCH 29/41] Applied code review corrections --- gensim/models/base_any2vec.py | 47 ++++++++++++++++++++++++----------- gensim/models/doc2vec.py | 31 +++++++++++++++-------- gensim/models/fasttext.py | 3 +-- gensim/models/poincare.py | 30 +++++++++++++--------- 4 files changed, 71 insertions(+), 40 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index d5175b5d78..a2c6ca3760 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -16,8 +16,15 @@ Notes ----- -Even though this is the usual case, not all embeddings transform text. -For example :class:`~gensim.models.poincare.PoincareModel` operates on graph representations. +Even though this is the usual case, not all embeddings transform text. Check the next section for +concrete examples. + +See Also +-------- +:class:`~gensim.models.word2vec.Word2Vec`. +:class:`~gensim.models.fasttext.FastText`. +:class:`~gensim.models.doc2vec.Doc2Vec`. 
+:class:`~gensim.models.poincare.PoincareModel` """ from gensim import utils @@ -51,18 +58,18 @@ class BaseAny2VecModel(utils.SaveLoad): In the special but usual case where the input space consists of words, a more specialized layer is provided, consider inheriting from :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + Notes + ----- + A subclass should initialize the following attributes: + - self.kv (instance of concrete implementation of `BaseKeyedVectors` interface) + - self.vocabulary (instance of concrete implementation of `BaseVocabBuilder` abstract class) + - self.trainables (instance of concrete implementation of `BaseTrainables` abstract class) + """ def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000): """Initialize model parameters. - Notes - ----- - A subclass should initialize the following attributes: - - self.kv (instance of concrete implementation of `BaseKeyedVectors` interface) - - self.vocabulary (instance of concrete implementation of `BaseVocabBuilder` abstract class) - - self.trainables (instance of concrete implementation of `BaseTrainables` abstract class) - Parameters ---------- workers : int, optional @@ -91,7 +98,7 @@ def _get_job_params(self, cur_epoch): raise NotImplementedError() def _set_train_params(self, **kwargs): - """Set model parameters required for training""" + """Set model parameters required for training.""" raise NotImplementedError() def _update_job_params(self, job_params, epoch_progress, cur_epoch): @@ -465,6 +472,7 @@ def load(cls, fname_or_handle, **kwargs): ------ IOError When methods are called on instance (should be called from class). + """ return super(BaseAny2VecModel, cls).load(fname_or_handle, **kwargs) @@ -481,6 +489,7 @@ def save(self, fname_or_handle, **kwargs): See Also -------- :meth:`~gensim.models.base_any2vec.BaseAny2VecModel.save` + """ super(BaseAny2VecModel, self).save(fname_or_handle, **kwargs) @@ -488,9 +497,10 @@ def save(self, fname_or_handle, **kwargs): class BaseWordEmbeddingsModel(BaseAny2VecModel): """Base class containing common methods for training, using & evaluating word embeddings learning models. - Example implementations are - * :class:`~gensim.models.word2vec.Word2Vec` - * :class:`~gensim.models.word2vec.FastText`, etc. + See Also + -------- + :class:`~gensim.models.word2vec.Word2Vec` + :class:`~gensim.models.word2vec.FastText`, etc. """ @@ -567,6 +577,7 @@ def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbac Whether or not the fast cython implementation of the internal training methods is available. 1 means it is. **kwargs Key word arguments needed to allow children classes to accept more arguments. + """ self.sg = int(sg) if vector_size % 4 != 0: @@ -730,6 +741,7 @@ def __str__(self): str A human readable string containing the class name, as well as the id to word mapping, number of features and starting learning rate used by the object. 
+ """ return "%s(vocab=%s, size=%s, alpha=%s)" % ( self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha @@ -766,7 +778,6 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca **kwargs Key word arguments propagated to `self.vocabulary.prepare_vocab` - """ total_words, corpus_count = self.vocabulary.scan_vocab( sentences, progress_per=progress_per, trim_rule=trim_rule) @@ -846,6 +857,7 @@ def estimate_memory(self, vocab_size=None, report=None): ------- dict of (str, int) A dictionary from string representations of the model's memory consuming members to their size in bytes. + """ vocab_size = vocab_size or len(self.wv.vocab) report = report or {} @@ -900,6 +912,7 @@ def train(self, sentences, total_examples=None, total_words=None, ------- (int, int) Tuple of (effective word count after ignoring unknown words and sentence length trimming, total word count). + """ self.alpha = start_alpha or self.alpha @@ -923,6 +936,7 @@ def _get_job_params(self, cur_epoch): ------- float The learning rate for this epoch (it is linearly reduced with epochs from `self.alpha` to `self.min_alpha`). + """ alpha = self.alpha - ((self.alpha - self.min_alpha) * float(cur_epoch) / self.epochs) return alpha @@ -933,7 +947,7 @@ def _update_job_params(self, job_params, epoch_progress, cur_epoch): Parameters ---------- job_params : dict of (str, obj) - Unused. TODO: Delete this. + Unused epoch_progress : float Ratio of finished work in the current epoch. cur_epoch : int @@ -960,6 +974,7 @@ def _get_thread_working_mem(self): ------- (np.ndarray, np.ndarray) Each worker threads private work memory. + """ work = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) # per-thread private work memory neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) @@ -977,6 +992,7 @@ def _raw_word_count(self, job): ------- int Number of raw words in the corpus chunk. + """ return sum(len(sentence) for sentence in job) @@ -1066,6 +1082,7 @@ def load(cls, *args, **kwargs): ------ IOError When methods are called on instance (should be called from class). + """ model = super(BaseWordEmbeddingsModel, cls).load(*args, **kwargs) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index fa10665ea7..0c5ef5e7cc 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -27,20 +27,21 @@ #. Initialize a model with e.g.:: - >>> from gensim.test.utils import common_texts - >>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument - >>> - >>> documents = [TaggedDocument(word, [i]) for i, word in enumerate(common_texts)] - >>> model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4) +>>> from gensim.test.utils import common_texts, get_tmpfile +>>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument +>>> +>>> documents = [TaggedDocument(word, [i]) for i, word in enumerate(common_texts)] +>>> model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4) #. Persist a model to disk with:: - >>> model.save('/tmp/model') - >>> model = Doc2Vec.load('/tmp/model') # you can continue training with the loaded model! +>>> tmp_f = get_tmpfile("model") +>>> model.save(tmp_f) +>>> model = Doc2Vec.load(tmp_f) # you can continue training with the loaded model! 
 If you're finished training a model (=no more updates, only querying), you can do::

-    >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
+>>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

 to trim unneeded model memory = use (much) less RAM.

 """
@@ -107,7 +108,8 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None,
         Indices into `doctag_vectors` used to obtain the tags of the document.
     alpha : float
         Learning rate.
-    work :
+    work : np.ndarray
+        Private working memory for each worker.
     train_words : bool, optional
         Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both**
         `learn_words` and `train_words` are set to True.
@@ -717,16 +719,23 @@ def estimated_lookup_memory(self):
     def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
         """Infer a vector for given post-bulk training document.

+        Notes
+        -----
+        Subsequent calls to this function may infer different representations for the same document.
+        For a more stable representation, increase the number of steps to ensure a stricter convergence.
+
         Parameters
         ----------
         doc_words : list of str
-            A (potentially unseen) document.
+            A document for which the vector representation will be inferred. Note this does not have to
+            be already used in training; it can be a completely new document.
         alpha : float, optional
             The initial learning rate.
         min_alpha : float, optional
             Learning rate will linearly drop to `min_alpha` as training progresses.
         steps : int, optional
-            Number of times to train the new document.
+            Number of times to train the new document. A higher value may slow down training, but
+            it will result in more stable representations.

         Returns
         -------
diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 949e94640a..7f766fbd3a 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -81,7 +81,6 @@
 logger = logging.getLogger(__name__)

 try:
-    raise ImportError
     from gensim.models.fasttext_inner import train_batch_sg, train_batch_cbow
     from gensim.models.fasttext_inner import FAST_VERSION, MAX_WORDS_IN_BATCH

@@ -601,7 +600,7 @@ def train(self, sentences, total_examples=None, total_words=None,
         self.trainables.get_vocab_word_vecs(self.wv)

     def init_sims(self, replace=False):
-        """Deletes the keyed vector syn1 structure.
+        """Precompute L2-normalized vectors.

         Notes
         -----
diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py
index 9353226d3e..bb61036e8c 100644
--- a/gensim/models/poincare.py
+++ b/gensim/models/poincare.py
@@ -348,7 +348,9 @@ def _clip_vectors(vectors, epsilon):
     def save(self, *args, **kwargs):
         """Save complete model to disk, inherited from :class:`~gensim.utils.SaveLoad`.

-        See also :meth:`~gensim.models.poincare.PoincareModel.load`
+        See Also
+        --------
+        :meth:`~gensim.models.poincare.PoincareModel.load`

         Parameters
         ----------
@@ -367,7 +369,9 @@ def load(cls, *args, **kwargs):
     def load(cls, *args, **kwargs):
         """Load model from disk, inherited from :class:`~gensim.utils.SaveLoad`.

-        See also :meth:`~gensim.models.poincare.PoincareModel.save`
+        See Also
+        --------
+        :meth:`~gensim.models.poincare.PoincareModel.save`

         Parameters
         ----------
@@ -840,16 +844,18 @@ def word_vec(self, word):
         """
         Accept a single word as input.
         Returns the word's representations in vector space, as a 1D numpy array.
-        Example::
-            >>> from gensim.test.utils import datapath
-            >>>
-            >>> # Read the sample relations file and train the model
-            >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv'))
-            >>> model = PoincareModel(train_data=relations)
-            >>> model.train(epochs=50)
-            >>>
-            >>> # Query the trained model.
-            >>> wv = model.kv.word_vec('kangaroo.n.01')
+        Examples
+        --------
+
+        >>> from gensim.test.utils import datapath
+        >>>
+        >>> # Read the sample relations file and train the model
+        >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv'))
+        >>> model = PoincareModel(train_data=relations)
+        >>> model.train(epochs=50)
+        >>>
+        >>> # Query the trained model.
+        >>> wv = model.kv.word_vec('kangaroo.n.01')

         """
         return super(PoincareKeyedVectors, self).get_vector(word)

From 86a6d236fc43350e9d4a24dd3a32a72bfcbedec4 Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Mon, 2 Apr 2018 12:33:32 +0200
Subject: [PATCH 30/41] added documentation for public cython methods in
 `fasttext`

---
 gensim/models/fasttext_inner.pyx | 47 ++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/gensim/models/fasttext_inner.pyx b/gensim/models/fasttext_inner.pyx
index ac7cdafbd5..007065ba12 100644
--- a/gensim/models/fasttext_inner.pyx
+++ b/gensim/models/fasttext_inner.pyx
@@ -244,6 +244,30 @@ cdef void fast_sentence_cbow_hs(


 def train_batch_sg(model, sentences, alpha, _work, _l1):
+    """Update skip-gram model by training on a sequence of sentences.
+
+    Each sentence is a list of string tokens, which are looked up in the model's
+    vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`.
+
+    Parameters
+    ----------
+    model : :class:`~gensim.models.fasttext.FastText`
+        Model to be trained.
+    sentences : iterable of list of str
+        Corpus streamed directly from disk/network.
+    alpha : float
+        Learning rate.
+    _work : np.ndarray
+        Private working memory for each worker.
+    _l1 : np.ndarray
+        Private working memory for each worker.
+
+    Returns
+    -------
+    int
+        Effective number of words trained.
+
+    """
     cdef int hs = model.hs
     cdef int negative = model.negative
     cdef int sample = (model.vocabulary.sample != 0)
@@ -374,6 +398,29 @@ def train_batch_sg(model, sentences, alpha, _work, _l1):


 def train_batch_cbow(model, sentences, alpha, _work, _neu1):
+    """Update the CBOW model by training on a sequence of sentences.
+
+    Each sentence is a list of string tokens, which are looked up in the model's
+    vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`.
+
+    Parameters
+    ----------
+    model : :class:`~gensim.models.fasttext.FastText`
+        Model to be trained.
+    sentences : iterable of list of str
+        Corpus streamed directly from disk/network.
+    alpha : float
+        Learning rate.
+    _work : np.ndarray
+        Private working memory for each worker.
+    _neu1 : np.ndarray
+        Private working memory for each worker.
+
+    Returns
+    -------
+    int
+        Effective number of words trained.
+
+    """
     cdef int hs = model.hs
     cdef int negative = model.negative
     cdef int sample = (model.vocabulary.sample != 0)
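Both entry points above are dispatched from `FastText.train()` according to the `sg` flag chosen at model construction: 1 selects `train_batch_sg`, 0 selects `train_batch_cbow`. A minimal sketch of the two modes, assuming the gensim 3.x API this series documents (`common_texts` is the bundled toy corpus)::

    from gensim.models import FastText
    from gensim.test.utils import common_texts

    # sg=1 routes every training job through train_batch_sg, while the
    # default sg=0 routes it through train_batch_cbow; the worker threads
    # supply the private _work/_l1 (or _neu1) buffers on each call.
    model_sg = FastText(common_texts, size=4, window=3, min_count=1, sg=1)
    model_cbow = FastText(common_texts, size=4, window=3, min_count=1, sg=0)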
From dc2f93ef812dee3713768b1e7f8fa2fb28c98a05 Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Tue, 3 Apr 2018 20:06:14 +0200
Subject: [PATCH 31/41] added documentation for C functions in the word2vec

---
 gensim/models/word2vec_inner.pyx | 172 ++++++++++++++++++++++++++++++-
 1 file changed, 169 insertions(+), 3 deletions(-)

diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx
index 6974c3a96a..7d8ae26208 100755
--- a/gensim/models/word2vec_inner.pyx
+++ b/gensim/models/word2vec_inner.pyx
@@ -66,12 +66,45 @@ cdef void our_saxpy_noblas(const int *N, const float *alpha, const float *X, con
     for i from 0 <= i < N[0] by 1:
         Y[i * (incY[0])] = (alpha[0]) * X[i * (incX[0])] + Y[i * (incY[0])]

-
 cdef void fast_sentence_sg_hs(
     const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen,
     REAL_t *syn0, REAL_t *syn1, const int size,
     const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work, REAL_t *word_locks,
     const int _compute_loss, REAL_t *_running_training_loss_param) nogil:
+    """Train on a single effective word from the current batch, using the Skip-Gram model.
+
+    In this model we are using a given word to predict a context word (a word that is
+    close to the one we are using as training). Hierarchical softmax is used to speed-up
+    training.
+
+    Parameters
+    ----------
+    word_point
+        Indices of the inner (non-leaf) nodes in the Huffman tree on the path to the current word.
+    word_code
+        Binary (uint8) Huffman code of the current word, one bit per inner node on its path.
+    codelen
+        Length of the current word's Huffman code.
+    syn0
+        Embeddings for the words in the vocabulary (`model.wv.vectors`).
+    syn1
+        Weights of the hidden layer in the model's trainable neural network.
+    size
+        Length of the embeddings.
+    word2_index
+        Index of the context word in the vocabulary.
+    alpha
+        Learning rate.
+    work
+        Private working memory for each worker.
+    word_locks
+        Lock factors for each word. A value of 0 will block training.
+    _compute_loss
+        Whether or not the loss should be computed at this step.
+    _running_training_loss_param
+        Running loss, used to debug or inspect how training progresses.
+
+    """
     cdef long long a, b
     cdef long long row1 = word2_index * size, row2, sgn
@@ -124,7 +157,49 @@ cdef unsigned long long fast_sentence_sg_neg(
     const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work,
     unsigned long long next_random, REAL_t *word_locks,
     const int _compute_loss, REAL_t *_running_training_loss_param) nogil:
+    """Train on a single effective word from the current batch, using the Skip-Gram model.
+
+    In this model we are using a given word to predict a context word (a word that is
+    close to the one we are using as training). Negative sampling is used to speed-up
+    training.
+
+    Parameters
+    ----------
+    negative
+        Number of negative words to be sampled.
+    cum_table
+        Cumulative-distribution table using stored vocabulary word counts for
+        drawing random words (with a negative label).
+    cum_table_len
+        Length of the `cum_table`.
+    syn0
+        Embeddings for the words in the vocabulary (`model.wv.vectors`).
+    syn1neg
+        Weights of the hidden layer in the model's trainable neural network.
+    size
+        Length of the embeddings.
+    word_index
+        Index of the current training word in the vocabulary.
+    word2_index
+        Index of the context word in the vocabulary.
+    alpha
+        Learning rate.
+    work
+        Private working memory for each worker.
+    next_random
+        Seed to produce the index for the next word to be randomly sampled.
+    word_locks
+        Lock factors for each word. A value of 0 will block training.
+    _compute_loss
+        Whether or not the loss should be computed at this step.
+    _running_training_loss_param
+        Running loss, used to debug or inspect how training progresses.
+
+    Returns
+    -------
+    unsigned long long
+        Seed to draw the training word for the next iteration of the same routine.
+
+    """
     cdef long long a
     cdef long long row1 = word2_index * size, row2
     cdef unsigned long long modulo = 281474976710655ULL
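The `cum_table` consumed above is how the negative-sampling draw stays proportional to the smoothed word frequencies: a uniform integer below `cum_table[-1]` is mapped back to a word index by binary search. A toy pure-Python equivalent of that draw (illustrative only, not the cython code)::

    import random
    from bisect import bisect_right

    def draw_negative(cum_table):
        # Word i owns the interval [cum_table[i - 1], cum_table[i]), so it
        # is hit with probability proportional to its share of the total.
        return bisect_right(cum_table, random.randrange(cum_table[-1]))

    cum_table = [5, 8, 9, 12]  # 4-word vocabulary with smoothed counts 5, 3, 1, 3
    negative_word_index = draw_negative(cum_table)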
@@ -173,7 +248,50 @@ cdef void fast_sentence_cbow_hs(
     const np.uint32_t indexes[MAX_SENTENCE_LEN], const REAL_t alpha, REAL_t *work,
     int i, int j, int k, int cbow_mean, REAL_t *word_locks,
     const int _compute_loss, REAL_t *_running_training_loss_param) nogil:
+    """Train on a single effective word from the current batch, using the CBOW method.
+
+    Using this method we train the trainable neural network by attempting to predict a
+    given word by its context (words surrounding the one we are trying to predict).
+    Hierarchical softmax method is used to speed-up training.
+
+    Parameters
+    ----------
+    word_point
+        Indices of the inner (non-leaf) nodes in the Huffman tree on the path to the predicted word.
+    word_code
+        Binary (uint8) Huffman code of the predicted word, one bit per inner node on its path.
+    codelens
+        Lengths of the Huffman codes of all words in the sentence.
+    neu1
+        Private working memory for every worker.
+    syn0
+        Embeddings for the words in the vocabulary (`model.wv.vectors`).
+    syn1
+        Weights of the hidden layer in the model's trainable neural network.
+    size
+        Length of the embeddings.
+    indexes
+        Indexes of the context words in the vocabulary.
+    alpha
+        Learning rate.
+    work
+        Private working memory for each worker.
+    i
+        Index of the word to be predicted from the context.
+    j
+        Index of the word at the beginning of the context window.
+    k
+        Index of the word at the end of the context window.
+    cbow_mean
+        If 0, use the sum of the context word vectors as the prediction. If 1, use the mean.
+    word_locks
+        Lock factors for each word. A value of 0 will block training.
+    _compute_loss
+        Whether or not the loss should be computed at this step.
+    _running_training_loss_param
+        Running loss, used to debug or inspect how training progresses.
+
+    """
     cdef long long a, b
     cdef long long row2, sgn
     cdef REAL_t f, g, count, inv_count = 1.0, f_dot, lprob
@@ -228,7 +346,55 @@ cdef unsigned long long fast_sentence_cbow_neg(
     const np.uint32_t indexes[MAX_SENTENCE_LEN], const REAL_t alpha, REAL_t *work,
     int i, int j, int k, int cbow_mean, unsigned long long next_random, REAL_t *word_locks,
     const int _compute_loss, REAL_t *_running_training_loss_param) nogil:
+    """Train on a single effective word from the current batch, using the CBOW method.
+
+    Using this method we train the trainable neural network by attempting to predict a
+    given word by its context (words surrounding the one we are trying to predict).
+    Negative sampling is used to speed-up training.
+
+    Parameters
+    ----------
+    negative
+        Number of negative words to be sampled.
+    cum_table
+        Cumulative-distribution table using stored vocabulary word counts for
+        drawing random words (with a negative label).
+    cum_table_len
+        Length of the `cum_table`.
+    codelens
+        Lengths of the Huffman codes of all words in the sentence.
+    neu1
+        Private working memory for every worker.
+    syn0
+        Embeddings for the words in the vocabulary (`model.wv.vectors`).
+    syn1neg
+        Weights of the hidden layer in the model's trainable neural network.
+    size
+        Length of the embeddings.
+    indexes
+        Indexes of the context words in the vocabulary.
+    alpha
+        Learning rate.
+    work
+        Private working memory for each worker.
+    i
+        Index of the word to be predicted from the context.
+    j
+        Index of the word at the beginning of the context window.
+    k
+        Index of the word at the end of the context window.
+    cbow_mean
+        If 0, use the sum of the context word vectors as the prediction. If 1, use the mean.
+    next_random
+        Seed for drawing the predicted word for the next iteration of the same routine.
+    word_locks
+        Lock factors for each word. A value of 0 will block training.
+    _compute_loss
+        Whether or not the loss should be computed at this step.
+    _running_training_loss_param
+        Running loss, used to debug or inspect how training progresses.
+
+    Returns
+    -------
+    unsigned long long
+        Seed for drawing the predicted word on the next invocation of this routine.
+
+    """
     cdef long long a
     cdef long long row2
     cdef unsigned long long modulo = 281474976710655ULL
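The `cbow_mean` switch documented in both CBOW routines decides how the context window is pooled into a single hidden-layer input. In numpy terms (an illustrative sketch, not the cython implementation)::

    import numpy as np

    context = np.random.rand(4, 100)  # vectors of the 4 words in the window

    h_sum = context.sum(axis=0)    # cbow_mean=0: plain sum of context vectors
    h_mean = context.mean(axis=0)  # cbow_mean=1: mean, so the magnitude does
                                   # not grow with the window size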
@@ -307,7 +473,7 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss):
     alpha : float
         The learning rate
     _work : np.ndarray
-        Private working memory for each worker.
+        Private working memory for each worker.
     compute_loss : bool
         Whether or not the training loss should be computed in this batch.

@@ -322,7 +488,7 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss):
     cdef int negative = model.negative
     cdef int sample = (model.vocabulary.sample != 0)

-    cdef int _compute_loss = (1 if compute_loss == True else 0)
+    cdef int _compute_loss = (1 if compute_loss else 0)
     cdef REAL_t _running_training_loss = model.running_training_loss

     cdef REAL_t *syn0 = <REAL_t *>(np.PyArray_DATA(model.wv.vectors))

From f78348f736a239a7ec91fd99f9f1e6cfd1903046 Mon Sep 17 00:00:00 2001
From: ivan
Date: Wed, 11 Apr 2018 17:45:00 +0500
Subject: [PATCH 32/41] fix build issues

---
 gensim/models/base_any2vec.py | 42 ++++++++++++++++-----------
 gensim/models/doc2vec.py      | 58 +++++++++++++++++++----------------
 gensim/models/fasttext.py     | 39 +++++++++++------------
 gensim/models/word2vec.py     | 14 +++++----
 4 files changed, 83 insertions(+), 70 deletions(-)

diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py
index a2c6ca3760..06c6401ca2 100644
--- a/gensim/models/base_any2vec.py
+++ b/gensim/models/base_any2vec.py
@@ -541,12 +541,14 @@ def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbac
             Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
             or a callable that accepts parameters (word, count, min_count) and returns either
             :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
-            The input parameters are of the following types
-                * word: str. The word we are examining
-                * count: int. The word's occurence count in the corpus
-                * min_count: int. The minimum count threshold.
-            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
-            of the model.
+            The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the
+            model.
+
+            The input parameters are of the following types:
+                * `word` (str) - the word we are examining
+                * `count` (int) - the word's frequency count in the corpus
+                * `min_count` (int) - the minimum count threshold.
+
         sg : {1, 0}, optional
             Defines the training algorithm. If 1, skip-gram is used, otherwise, CBOW is employed.
alpha : float, optional @@ -769,12 +771,14 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The input parameters are of the following types - * word: str. The word we are examining - * count: int. The word's occurence count in the corpus - * min_count: int. The minimum count threshold. - Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part - of the model. + The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the + model. + + The input parameters are of the following types: + * `word` (str) - the word we are examining + * `count` (int) - the word's frequency count in the corpus + * `min_count` (int) - the minimum count threshold. + **kwargs Key word arguments propagated to `self.vocabulary.prepare_vocab` @@ -805,12 +809,14 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The input parameters are of the following types - * word: str. The word we are examining - * count: int. The word's occurence count in the corpus - * min_count: int. The minimum count threshold. - Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part - of the model. + The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the + model. + + The input parameters are of the following types: + * `word` (str) - the word we are examining + * `count` (int) - the word's frequency count in the corpus + * `min_count` (int) - the minimum count threshold. + update : bool, optional If true, the new provided words in `word_freq` dict will be added to model's vocab. diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 0c5ef5e7cc..b92e123b9c 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -20,33 +20,34 @@ `_. **Make sure you have a C compiler before installing gensim, to use optimized (compiled) -doc2vec training** (70x speedup [blog]_). +doc2vec training** (70x speedup `blog `_). Examples -------- -#. Initialize a model with e.g.:: +#. Initialize a model with e.g. :: ->>> from gensim.test.utils import common_texts, get_tmpfile ->>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument ->>> ->>> documents = [TaggedDocument(word, [i]) for i, word in enumerate(common_texts)] ->>> model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4) + >>> from gensim.test.utils import common_texts, get_tmpfile + >>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument + >>> + >>> documents = [TaggedDocument(word, [i]) for i, word in enumerate(common_texts)] + >>> model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4) -#. Persist a model to disk with:: +#. Persist a model to disk with :: ->>> tmp_f = get_tmpfile("model") ->>> model.save(tmp_f) ->>> model = Doc2Vec.load(tmp_f) # you can continue training with the loaded model! 
+ >>> tmp_f = get_tmpfile("model") + >>> model.save(tmp_f) + >>> model = Doc2Vec.load(tmp_f) # you can continue training with the loaded model! -If you're finished training a model (=no more updates, only querying), you can do:: +#. If you're finished training a model (=no more updates, only querying, reduce memory usage), you can do :: ->>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True) + >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True) -to trim unneeded model memory = use (much) less RAM. +#. Infer vector for new document -""" + >>> vector = model.infer_vector(["system", "response"]) +""" import logging import os import warnings @@ -1018,12 +1019,14 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The input parameters are of the following types - * word: str. The word we are examining - * count: int. The word's occurence count in the corpus - * min_count: int. The minimum count threshold. - Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part - of the model. + The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the + model. + + The input parameters are of the following types: + * `word` (str) - the word we are examining + * `count` (int) - the word's frequency count in the corpus + * `min_count` (int) - the minimum count threshold. + update : bool, optional If true, the new provided words in `word_freq` dict will be added to model's vocab. @@ -1104,12 +1107,13 @@ def scan_vocab(self, documents, docvecs, progress_per=10000, trim_rule=None): Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The input parameters are of the following types - * word: str. The word we are examining - * count: int. The word's occurence count in the corpus - * min_count: int. The minimum count threshold. - Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part - of the model. + The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the + model. + + The input parameters are of the following types: + * `word` (str) - the word we are examining + * `count` (int) - the word's frequency count in the corpus + * `min_count` (int) - the minimum count threshold. Returns ------- diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 7f766fbd3a..4a2208dc83 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -25,13 +25,13 @@ Examples -------- -#. Initialize a model with e.g.:: +#. Initialize a model with e.g. :: >>> from gensim.test.utils import common_texts >>> >>> model = FastText(size=4, window=3, min_count=1) >>> model.build_vocab(common_texts) -#. Persist a model to disk with:: +#. Persist a model to disk with :: >>> model.save("temp_model.w2v") >>> model = FastText.load("temp_model.w2v") # you can continue training with the loaded model! 
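The n-gram machinery described in the `FastText` class docstring can be exercised directly: a word absent from the vocabulary still receives a vector, assembled from the character n-grams it shares with known words. A small sketch reusing the toy setup from the examples above ('computes' is assumed to be out-of-vocabulary for `common_texts`)::

    from gensim.models import FastText
    from gensim.test.utils import common_texts

    model = FastText(size=4, window=3, min_count=1)
    model.build_vocab(common_texts)
    model.train(common_texts, total_examples=model.corpus_count, epochs=model.epochs)

    # 'computes' never occurs in the corpus, but it shares character n-grams
    # with 'computer', so a vector can still be synthesized for it.
    assert 'computes' not in model.wv.vocab
    oov_vector = model.wv['computes']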
@@ -40,7 +40,7 @@ >>> computer_vec = model.wv['computer'] # numpy vector of a word -#. You can perform various NLP word tasks with the model. Some of them are already built-in:: +#. You can perform various NLP word tasks with the model. Some of them are already built-in :: >>> similarities = model.wv.most_similar(positive=['computer', 'human'], negative=['interface']) >>> most_similar = similarities[0] @@ -52,7 +52,7 @@ >>> sim_score = model.wv.similarity('computer', 'human') -#. Correlation with human opinion on word similarity:: +#. Correlation with human opinion on word similarity :: >>> from gensim.test.utils import datapath >>> @@ -63,7 +63,6 @@ >>> analogies = model.wv.accuracy(datapath('questions-words.txt')) """ - import logging import struct @@ -214,12 +213,10 @@ class FastText(BaseWordEmbeddingsModel): compute embeddings even for **unseen** words (that do not exist in the vocabulary), as the aggregate of the n-grams included in the word. After training the model, this attribute can be used directly to query those embeddings in various ways. Check the module level docstring from some examples. - self.vocabulary : :class:'~gensim.models.fasttext.FastTextVocab' This object represents the vocabulary (sometimes called Dictionary in gensim) of the model. Besides keeping track of all unique words, this object provides extra functionality, such as constructing a huffman tree (frequent words are closer to the root), or discarding extremely rare words. - self.trainables : :class:`~gensim.models.fasttext.FastTextTrainables` This object represents the inner shallow neural network used to train the embeddings. This is very similar to the network of the Word2Vec model, but it also trains weights for the N-Grams (sequences of more @@ -292,12 +289,14 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The input parameters are of the following types - * word: str. The word we are examining - * count: int. The word's occurence count in the corpus - * min_count: int. The minimum count threshold. - Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part - of the model. + The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the + model. + + The input parameters are of the following types: + * `word` (str) - the word we are examining + * `count` (int) - the word's frequency count in the corpus + * `min_count` (int) - the minimum count threshold. + sorted_vocab : {1,0}, optional If 1, sort the vocabulary by descending frequency before assigning word indices. batch_words : int, optional @@ -423,12 +422,14 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The input parameters are of the following types - * word: str. The word we are examining - * count: int. The word's occurence count in the corpus - * min_count: int. The minimum count threshold. 
- Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part - of the model. + The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the + model. + + The input parameters are of the following types: + * `word` (str) - the word we are examining + * `count` (int) - the word's frequency count in the corpus + * `min_count` (int) - the minimum count threshold. + **kwargs Additional key word parameters passed to :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.build_vocab`. diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index c574a5a511..0d4d10edf5 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -693,12 +693,14 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The input parameters are of the following types - * word: str. The word we are examining - * count: int. The word's occurence count in the corpus - * min_count: int. The minimum count threshold. - Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part - of the model. + The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the + model. + + The input parameters are of the following types: + * `word` (str) - the word we are examining + * `count` (int) - the word's frequency count in the corpus + * `min_count` (int) - the minimum count threshold. + sorted_vocab : int {1,0} If 1, sort the vocabulary by descending frequency before assigning word indexes. batch_words : int From cec8c44c139f592764f592a9b2ca1225d5a351c5 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 11 Apr 2018 22:27:14 +0500 Subject: [PATCH 33/41] add missing rst --- docs/src/apiref.rst | 1 + docs/src/models/base_any2vec.rst | 10 ++++++++++ 2 files changed, 11 insertions(+) create mode 100644 docs/src/models/base_any2vec.rst diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index 66fe192b07..1c968c7cd7 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -64,6 +64,7 @@ Modules: models/deprecated/word2vec models/deprecated/keyedvectors models/deprecated/fasttext_wrapper + models/base_any2vec similarities/docsim similarities/index sklearn_api/atmodel diff --git a/docs/src/models/base_any2vec.rst b/docs/src/models/base_any2vec.rst new file mode 100644 index 0000000000..e6685cda66 --- /dev/null +++ b/docs/src/models/base_any2vec.rst @@ -0,0 +1,10 @@ +:mod:`models.base_any2vec` -- Base classes for any2vec models +============================================================= + +.. 
automodule:: gensim.models.base_any2vec + :synopsis: Base classes for any2vec models + :members: + :inherited-members: + :special-members: __getitem__ + :undoc-members: + :show-inheritance: From 585f81fb8a3512951ff847503f4232496352ee54 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 13 Apr 2018 15:14:50 +0500 Subject: [PATCH 34/41] fix base_any2vec --- gensim/models/base_any2vec.py | 166 +++++++++++++++++++--------------- 1 file changed, 91 insertions(+), 75 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 06c6401ca2..60d15cbd92 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -22,9 +22,13 @@ See Also -------- :class:`~gensim.models.word2vec.Word2Vec`. + Word2Vec model - embeddings for words. :class:`~gensim.models.fasttext.FastText`. + FastText model - embeddings for words (ngram-based). :class:`~gensim.models.doc2vec.Doc2Vec`. + Doc2Vec model - embeddings for documents. :class:`~gensim.models.poincare.PoincareModel` + Poincare model - embeddings for graphs. """ from gensim import utils @@ -61,14 +65,14 @@ class BaseAny2VecModel(utils.SaveLoad): Notes ----- A subclass should initialize the following attributes: - - self.kv (instance of concrete implementation of `BaseKeyedVectors` interface) - - self.vocabulary (instance of concrete implementation of `BaseVocabBuilder` abstract class) - - self.trainables (instance of concrete implementation of `BaseTrainables` abstract class) - """ + * self.kv - keyed vectors in model (see :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` as example) + * self.vocabulary - vocabulary (see :class:`~gensim.models.word2vec.Word2VecVocab` as example) + * self.trainables - internal matrices (see :class:`~gensim.models.word2vec.Word2VecTrainables` as example) + """ def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000): - """Initialize model parameters. + """ Parameters ---------- @@ -128,7 +132,7 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N def _worker_loop(self, job_queue, progress_queue): """Train the model, lifting lists of data from the queue. - This function will be called in paralle by multiple workers (threads or processes) to make + This function will be called in parallel by multiple workers (threads or processes) to make optimal use of multicore machines. Parameters @@ -330,10 +334,10 @@ def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, For example in many implementations the learning rate would be dropping with the number of epochs. total_examples : int, optional Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences - in a corpus. Used to log progress. + in a corpus, used to log progress. total_words : int, optional Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw - words in a corpus. Used to log progress. + words in a corpus, used to log progress. queue_factor : int, optional Multiplier for size of queue -> size = number of workers * queue_factor. report_delay : float, optional @@ -343,7 +347,7 @@ def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, ------- (int, int, int) The training report for this epoch consisting of three elements: - * size of data chunk processed, for example number of sentences in the corpus chunk. + * Size of data chunk processed, for example number of sentences in the corpus chunk. 
* Effective word count used in training (after ignoring unknown words and trimming the sentence length).
             * Total word count used in training.

@@ -385,17 +389,17 @@ def train(self, data_iterable, epochs=None, total_examples=None,
             Number of epochs (training iterations over the whole input) of training.
         total_examples : int, optional
             Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences
-            in a corpus. Used to log progress.
+            in a corpus, used to log progress.
         total_words : int, optional
             Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw
-            words in a corpus. Used to log progress.
+            words in a corpus, used to log progress.
         queue_factor : int, optional
             Multiplier for size of queue -> size = number of workers * queue_factor.
         report_delay : float, optional
             Number of seconds between two consecutive progress report messages in the logger.
         callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
             List of callbacks that need to be executed/run at specific stages during training.
-        **kwargs
+        **kwargs : object
             Additional key word parameters for the specific model inheriting from this class.

         Returns
@@ -450,18 +454,19 @@ def train(self, data_iterable, epochs=None, total_examples=None,

     @classmethod
     def load(cls, fname_or_handle, **kwargs):
-        """Load a previously saved object (using :meth:`~gensim.base_any2vec.BaseAny2VecModel.save`) from file.
+        """Load a previously saved object (using :meth:`gensim.models.base_any2vec.BaseAny2VecModel.save`) from file.

         Parameters
         ----------
         fname_or_handle : {str, file-like object}
             Path to file that contains needed object or handle to the opened file.
-        **kwargs
+        **kwargs : object
             Key word arguments propagated to :meth:`~gensim.utils.SaveLoad.load`.

         See Also
         --------
-        :meth:`~gensim.base_any2vec.BaseAny2VecModel.save`
+        :meth:`gensim.models.base_any2vec.BaseAny2VecModel.save`
+            Method for saving a model.

         Returns
         -------
         :class:`~gensim.models.base_any2vec.BaseAny2VecModel`
             Model loaded from disk.

         Raises
         ------
         IOError
             When methods are called on instance (should be called from class).

         """
         return super(BaseAny2VecModel, cls).load(fname_or_handle, **kwargs)

@@ -483,12 +488,13 @@ def save(self, fname_or_handle, **kwargs):
         ----------
         fname_or_handle : {str, file-like object}
             Path to file where the model will be persisted.
-        **kwargs
+        **kwargs : object
             Key word arguments propagated to :meth:`~gensim.utils.SaveLoad.save`.

         See Also
         --------
-        :meth:`~gensim.models.base_any2vec.BaseAny2VecModel.save`
+        :meth:`gensim.models.base_any2vec.BaseAny2VecModel.load`
+            Method for loading a model saved with this method.

         """
         super(BaseAny2VecModel, self).save(fname_or_handle, **kwargs)

@@ -499,11 +505,16 @@ class BaseWordEmbeddingsModel(BaseAny2VecModel):

     See Also
     --------
-    :class:`~gensim.models.word2vec.Word2Vec`
-    :class:`~gensim.models.word2vec.FastText`, etc.
+    :class:`~gensim.models.word2vec.Word2Vec`.
+        Word2Vec model - embeddings for words.
+    :class:`~gensim.models.fasttext.FastText`.
+        FastText model - embeddings for words (ngram-based).
+    :class:`~gensim.models.doc2vec.Doc2Vec`.
+        Doc2Vec model - embeddings for documents.
+    :class:`~gensim.models.poincare.PoincareModel`
+        Poincare model - embeddings for graphs.

     """
-
     def _clear_post_train(self):
         raise NotImplementedError()

@@ -516,7 +527,7 @@ def _set_train_params(self, **kwargs):
     def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbacks=(),
                  batch_words=10000, trim_rule=None, sg=0, alpha=0.025, window=5, seed=1,
                  hs=0, negative=5, cbow_mean=1, min_alpha=0.0001, compute_loss=False, fast_version=0, **kwargs):
-        """Construct a base word embeddings model.
+ """ Parameters ---------- @@ -541,8 +552,8 @@ def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbac Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the - model. + The rule, if given, is only used to prune vocabulary during current method call and is not stored as part + of the model. The input parameters are of the following types: * `word` (str) - the word we are examining @@ -557,10 +568,11 @@ def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbac The maximum distance between the current and predicted word within a sentence. seed : int, optional Seed for the random number generator. Initial vectors for each word are seeded with a hash of - the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run, - you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter - from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires - use of the `PYTHONHASHSEED` environment variable to control hash randomization). + the concatenation of word + `str(seed)`. + Note that for a fully deterministically-reproducible run, you must also limit the model to a single worker + thread (`workers=1`), to eliminate ordering jitter from OS thread scheduling. + In Python 3, reproducibility between interpreter launches also requires use of the `PYTHONHASHSEED` + environment variable to control hash randomization. hs : {1,0}, optional If 1, hierarchical softmax will be used for model training. If set to 0, and `negative` is non-zero, negative sampling will be used. @@ -574,10 +586,10 @@ def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbac Final learning rate. Drops linearly with the number of iterations from `alpha`. compute_loss : bool, optional If True, loss will be computed while training the Word2Vec model and stored in - :attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss`. + :attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss` attribute. fast_version : {-1, 1}, optional Whether or not the fast cython implementation of the internal training methods is available. 1 means it is. - **kwargs + **kwargs : object Key word arguments needed to allow children classes to accept more arguments. """ @@ -741,7 +753,7 @@ def __str__(self): Returns ------- str - A human readable string containing the class name, as well as the id to word mapping, number of + A human readable string containing the class name, as well as the size of dictionary, number of features and starting learning rate used by the object. """ @@ -771,15 +783,15 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the - model. 
+ The rule, if given, is only used to prune vocabulary during current method call and is not stored as part + of the model. The input parameters are of the following types: * `word` (str) - the word we are examining * `count` (int) - the word's frequency count in the corpus * `min_count` (int) - the minimum count threshold. - **kwargs + **kwargs : object Key word arguments propagated to `self.vocabulary.prepare_vocab` """ @@ -809,8 +821,8 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the - model. + The rule, if given, is only used to prune vocabulary during current method call and is not stored as part + of the model. The input parameters are of the following types: * `word` (str) - the word we are examining @@ -820,13 +832,6 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No update : bool, optional If true, the new provided words in `word_freq` dict will be added to model's vocab. - Examples - -------- - >>> from gensim.models import Word2Vec - >>> - >>> model= Word2Vec() - >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20}) - """ logger.info("Processing provided word frequencies") # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) @@ -953,7 +958,7 @@ def _update_job_params(self, job_params, epoch_progress, cur_epoch): Parameters ---------- job_params : dict of (str, obj) - Unused + NOT USED. epoch_progress : float Ratio of finished work in the current epoch. cur_epoch : int @@ -1017,7 +1022,7 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N Number of documents in the corpus. Either `total_examples` or `total_words` **must** be supplied. total_words : int, optional Number of words in the corpus. Either `total_examples` or `total_words` **must** be supplied. - **kwargs + **kwargs : object Unused. Present to preserve signature among base and inherited implementations. Raises @@ -1070,14 +1075,15 @@ def load(cls, *args, **kwargs): Parameters ---------- - *args + *args : object Positional arguments passed to :meth:`~gensim.utils.SaveLoad.load`. - **kwargs + **kwargs : object Key word arguments passed to :meth:`~gensim.utils.SaveLoad.load`. See Also -------- :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.save` + Method for save a model. Returns ------- @@ -1090,7 +1096,6 @@ def load(cls, *args, **kwargs): When methods are called on instance (should be called from class). """ - model = super(BaseWordEmbeddingsModel, cls).load(*args, **kwargs) if model.negative and hasattr(model.wv, 'index2word'): model.vocabulary.make_cum_table(model.wv) # rebuild cum_table from vocabulary @@ -1153,7 +1158,7 @@ def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, tot def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words, trained_word_count, elapsed): - """Callback used to log the end of a training epoch + """Callback used to log the end of a training epoch. 
Parameters ---------- @@ -1222,73 +1227,84 @@ def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_ # for backward compatibility @deprecated("Method will be removed in 4.0.0, use self.wv.most_similar() instead") def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None): - """ - Deprecated. Use self.wv.most_similar() instead. - Refer to the documentation for `gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.most_similar` + """Deprecated, use self.wv.most_similar() instead. + + Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.most_similar`. + """ return self.wv.most_similar(positive, negative, topn, restrict_vocab, indexer) @deprecated("Method will be removed in 4.0.0, use self.wv.wmdistance() instead") def wmdistance(self, document1, document2): - """ - Deprecated. Use self.wv.wmdistance() instead. - Refer to the documentation for `gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.wmdistance` + """Deprecated, use self.wv.wmdistance() instead. + + Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.wmdistance`. + """ return self.wv.wmdistance(document1, document2) @deprecated("Method will be removed in 4.0.0, use self.wv.most_similar_cosmul() instead") def most_similar_cosmul(self, positive=None, negative=None, topn=10): - """ - Deprecated. Use self.wv.most_similar_cosmul() instead. - Refer to the documentation for `gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.most_similar_cosmul` + """Deprecated, use self.wv.most_similar_cosmul() instead. + + Refer to the documentation for + :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.most_similar_cosmul`. + """ return self.wv.most_similar_cosmul(positive, negative, topn) @deprecated("Method will be removed in 4.0.0, use self.wv.similar_by_word() instead") def similar_by_word(self, word, topn=10, restrict_vocab=None): - """ - Deprecated. Use self.wv.similar_by_word() instead. - Refer to the documentation for `gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similar_by_word` + """Deprecated, use self.wv.similar_by_word() instead. + + Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similar_by_word`. + """ return self.wv.similar_by_word(word, topn, restrict_vocab) @deprecated("Method will be removed in 4.0.0, use self.wv.similar_by_vector() instead") def similar_by_vector(self, vector, topn=10, restrict_vocab=None): - """ - Deprecated. Use self.wv.similar_by_vector() instead. - Refer to the documentation for `gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similar_by_vector` + """Deprecated, use self.wv.similar_by_vector() instead. + + Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similar_by_vector`. + """ return self.wv.similar_by_vector(vector, topn, restrict_vocab) @deprecated("Method will be removed in 4.0.0, use self.wv.doesnt_match() instead") def doesnt_match(self, words): - """ - Deprecated. Use self.wv.doesnt_match() instead. - Refer to the documentation for `gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.doesnt_match` + """Deprecated, use self.wv.doesnt_match() instead. + + Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.doesnt_match`. + """ return self.wv.doesnt_match(words) @deprecated("Method will be removed in 4.0.0, use self.wv.similarity() instead") def similarity(self, w1, w2): - """ - Deprecated. 
Use self.wv.similarity() instead. - Refer to the documentation for `gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity` + """Deprecated, use self.wv.similarity() instead. + + Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity`. + """ return self.wv.similarity(w1, w2) @deprecated("Method will be removed in 4.0.0, use self.wv.n_similarity() instead") def n_similarity(self, ws1, ws2): - """ - Deprecated. Use self.wv.n_similarity() instead. - Refer to the documentation for `gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.n_similarity` + """Deprecated, use self.wv.n_similarity() instead. + + Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.n_similarity`. + """ return self.wv.n_similarity(ws1, ws2) @deprecated("Method will be removed in 4.0.0, use self.wv.evaluate_word_pairs() instead") def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): - """ - Deprecated. Use self.wv.evaluate_word_pairs() instead. - Refer to the documentation for `gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.evaluate_word_pairs` + """Deprecated, use self.wv.evaluate_word_pairs() instead. + + Refer to the documentation for + :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.evaluate_word_pairs`. + """ return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown) From b5d84ff27287ca53861870eb090e49988718e287 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 13 Apr 2018 16:10:05 +0500 Subject: [PATCH 35/41] fix doc2vec[1] --- gensim/models/doc2vec.py | 50 +++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index b92e123b9c..ab188ef5c8 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -6,8 +6,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Deep learning via the distributed memory and distributed bag of words models from +"""Deep learning via the distributed memory and distributed bag of words models from `Quoc Le and Tomas Mikolov: "Distributed Representations of Sentences and Documents" `_, using either hierarchical softmax or negative sampling, see `Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean: "Efficient Estimation of Word Representations in @@ -25,7 +24,7 @@ Examples -------- -#. Initialize a model with e.g. :: +* Initialize a model with e.g. :: >>> from gensim.test.utils import common_texts, get_tmpfile >>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument @@ -33,17 +32,17 @@ >>> documents = [TaggedDocument(word, [i]) for i, word in enumerate(common_texts)] >>> model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4) -#. Persist a model to disk with :: +* Persist a model to disk with :: >>> tmp_f = get_tmpfile("model") >>> model.save(tmp_f) >>> model = Doc2Vec.load(tmp_f) # you can continue training with the loaded model! -#. If you're finished training a model (=no more updates, only querying, reduce memory usage), you can do :: +* If you're finished training a model (=no more updates, only querying, reduce memory usage), you can do :: >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True) -#. 
Infer vector for new document +* Infer vector for new document >>> vector = model.infer_vector(["system", "response"]) @@ -96,7 +95,7 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, Notes ----- This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from :mod:`~gensim.models.doc2vec_inner` instead. + will use the optimized version from :mod:`gensim.models.doc2vec_inner` instead. Parameters ---------- @@ -122,9 +121,9 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, learn_hidden : bool, optional Whether or not the weights of the hidden layer will be updated. word_vectors : object, optional - Unused. + UNUSED. word_locks : object, optional - Unused. + UNUSED. doctag_vectors : list of list of float, optional Vector representations of the tags. If None, these will be retrieved from the model. doctag_locks : list of float, optional @@ -155,7 +154,6 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, learn_doctags=True, learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): - """Update distributed memory model ("PV-DM") by training on a single document. Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and @@ -180,9 +178,9 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N alpha : float Learning rate. work : object - Unused + UNUSED. neu1 : object - Unused. + UNUSED. learn_doctags : bool, optional Whether the tag vectors should be updated. learn_words : bool, optional @@ -192,7 +190,7 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N Whether or not the weights of the hidden layer will be updated. word_vectors : iterable of list of float, optional Vector representations of each word in the model's vocabulary. - word_locks : listf of float, optional + word_locks : list of float, optional Lock factors for each word in the vocabulary. doctag_vectors : list of list of float, optional Vector representations of the tags. If None, these will be retrieved from the model. @@ -203,7 +201,7 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N ------- int Number of words in the input document that were actually used for training (they were found in the - vocavulary and they were not discarded by negative sampling). + vocabulary and they were not discarded by negative sampling). """ if word_vectors is None: @@ -266,9 +264,9 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, alpha : float Learning rate. work : object - Unused. + UNUSED. neu1 : object - Unused. + UNUSED. learn_doctags : bool, optional Whether the tag vectors should be updated. learn_words : bool, optional @@ -289,7 +287,7 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, ------- int Number of words in the input document that were actually used for training (they were found in the - vocavulary and they were not discarded by negative sampling). + vocabulary and they were not discarded by negative sampling). """ if word_vectors is None: @@ -343,22 +341,20 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, class TaggedDocument(namedtuple('TaggedDocument', 'words tags')): """Represents a document along with a tag. 
- A single document, made up of `words` (a list of unicode string tokens) - and `tags` (a list of tokens). Tags may be one or more unicode string - tokens, but typical practice (which will also be most memory-efficient) is - for the tags list to include a unique integer id as the only tag. + A single document, made up of `words` (a list of unicode string tokens) and `tags` (a list of tokens). + Tags may be one or more unicode string tokens, but typical practice (which will also be most memory-efficient) + is for the tags list to include a unique integer id as the only tag. - Replaces "sentence as a list of words" from Word2Vec. + Replaces "sentence as a list of words" from :class:`gensim.models.word2vec.Word2Vec`. """ - def __str__(self): """Human readable representation of the object's state, used for debugging. Returns ------- str - Human readable representation of the object's state. + Human readable representation of the object's state (words and tags). """ return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags) @@ -367,12 +363,13 @@ def __str__(self): # for compatibility @deprecated("Class will be removed in 4.0.0, use TaggedDocument instead") class LabeledSentence(TaggedDocument): + """Deprecated, use :class:`~gensim.models.doc2vec.TaggedDocument` instead.""" pass class Doctag(namedtuple('Doctag', 'offset, word_count, doc_count')): - """A string document tag discovered during the initial vocabulary - scan. (The document-vector equivalent of a Vocab object.) + """A string document tag discovered during the initial vocabulary scan. + The document-vector equivalent of a Vocab object. Will not be used if all presented document tags are ints. @@ -380,6 +377,7 @@ class Doctag(namedtuple('Doctag', 'offset, word_count, doc_count')): if-and-only-if no raw-int tags were used. If any raw-int tags were used, string Doctag vectors begin at index (max_rawint + 1), so the true index is (rawint_index + 1 + offset). See also _index_to_doctag(). + """ __slots__ = () From 6f32e7806463d4621bb9685710de832efb4cecf2 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 13 Apr 2018 16:27:13 +0500 Subject: [PATCH 36/41] fix doc2vec[2] --- gensim/models/doc2vec.py | 63 ++++++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 19 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index ab188ef5c8..21a42ed865 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -339,7 +339,7 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, class TaggedDocument(namedtuple('TaggedDocument', 'words tags')): - """Represents a document along with a tag. + """Represents a document along with a tag, input document format for :class:`~gensim.models.doc2vec.Doc2Vec`. A single document, made up of `words` (a list of unicode string tokens) and `tags` (a list of tokens). Tags may be one or more unicode string tokens, but typical practice (which will also be most memory-efficient) @@ -373,10 +373,9 @@ class Doctag(namedtuple('Doctag', 'offset, word_count, doc_count')): Will not be used if all presented document tags are ints. - The offset is only the true index into the doctags_syn0/doctags_syn0_lockf - if-and-only-if no raw-int tags were used. If any raw-int tags were used, - string Doctag vectors begin at index (max_rawint + 1), so the true index is - (rawint_index + 1 + offset). See also _index_to_doctag(). + The offset is only the true index into the doctags_syn0/doctags_syn0_lockf if-and-only-if no raw-int tags were used. 
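# A sketch of the offset arithmetic spelled out in this paragraph; the helper
# name and values are illustrative, not an actual gensim function:
>>> def true_doctag_index(offset, max_rawint):
...     # string Doctag rows are stored after any raw-int tag rows
...     return max_rawint + 1 + offset
>>> true_doctag_index(offset=2, max_rawint=9)
12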
+ If any raw-int tags were used, string Doctag vectors begin at index (max_rawint + 1), so the true index is + (rawint_index + 1 + offset), see also :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors._index_to_doctag`. """ __slots__ = () @@ -1280,13 +1279,28 @@ def get_doctag_trainables(self, doc_words, vector_size): class TaggedBrownCorpus(object): - """Iterate over documents from the Brown corpus (part of NLTK data), yielding - each document out as a TaggedDocument object.""" + """Reader for the `Brown corpus (part of NLTK data) `_.""" def __init__(self, dirname): + """ + + Parameters + ---------- + dirname : str + Path to folder with Brown corpus. + + """ self.dirname = dirname def __iter__(self): + """Iterate through the corpus. + + Yields + ------ + :class:`~gensim.models.doc2vec.TaggedDocument` + Document from `source`. + + """ for fname in os.listdir(self.dirname): fname = os.path.join(self.dirname, fname) if not os.path.isfile(fname): @@ -1304,29 +1318,40 @@ def __iter__(self): class TaggedLineDocument(object): - """Simple format: one document = one line = one TaggedDocument object. + """Simple reader for format: one document = one line = one :class:`~gensim.models.doc2vec.TaggedDocument` object. - Words are expected to be already preprocessed and separated by whitespace, - tags are constructed automatically from the document line number.""" + Words are expected to be already preprocessed and separated by whitespace, tags are constructed automatically + from the document line number. + """ def __init__(self, source): """ - `source` can be either a string (filename) or a file object. - Example:: - - documents = TaggedLineDocument('myfile.txt') - - Or for compressed files:: + Parameters + ---------- + source : str + Path to source file. - documents = TaggedLineDocument('compressed_text.txt.bz2') - documents = TaggedLineDocument('compressed_text.txt.gz') + Examples + -------- + >>> from gensim.test.utils import datapath + >>> from gensim.models.doc2vec import TaggedLineDocument + >>> + >>> for document in TaggedLineDocument(datapath("head500.noblanks.cor")): + ... pass """ self.source = source def __iter__(self): - """Iterate through the lines in the source.""" + """Iterate through the lines in the source. + + Yields + ------ + :class:`~gensim.models.doc2vec.TaggedDocument` + Document from `source`. + + """ try: # Assume it is a file-like object and try treating it as such # Things that don't have seek will trigger an exception From 2e3a0b71a99d24fe4f7019a6b1dc91fe24980b89 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 13 Apr 2018 17:33:01 +0500 Subject: [PATCH 37/41] =?UTF-8?q?fix=20doc2vec[3=D1=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- gensim/models/base_any2vec.py | 2 +- gensim/models/doc2vec.py | 66 ++++++++++++++++++----------------- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 60d15cbd92..a11d2074c8 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -958,7 +958,7 @@ def _update_job_params(self, job_params, epoch_progress, cur_epoch): Parameters ---------- job_params : dict of (str, obj) - NOT USED. + UNUSED. epoch_progress : float Ratio of finished work in the current epoch. 
cur_epoch : int diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 21a42ed865..82462ee0db 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -390,45 +390,46 @@ class Doc2Vec(BaseWordEmbeddingsModel): Some important attributes are the following: - self.wv : :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` + Attributes + ---------- + wv : :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` This object essentially contains the mapping between words and embeddings. After training, it can be used directly to query those embeddings in various ways. See the module level docstring for examples. - self.docvecs : :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors` + docvecs : :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors` This object contains the paragraph vectors. Remember that the only difference between this model and - Word2Vec is that besides the word vectors we also include paragraph embeddings to capture the paragraph. + :class:`~gensim.models.word2vec.Word2Vec` is that besides the word vectors we also include paragraph embeddings + to capture the paragraph. In this way we can capture the difference between the same word used in a different wide context. - For example we now have a different representation of the word "leaves" in the following two sentences:: + For example we now have a different representation of the word "leaves" in the following two sentences :: 1. Manos leaves the office every day at 18:00 to catch his train 2. This season is called Fall, because leaves fall from the trees. - In a plain Word2Vec model the word would have exactly the same representation in both sentences, in Doc2Vec it - will not. + In a plain :class:`~gensim.models.word2vec.Word2Vec` model the word would have exactly the same representation + in both sentences, in :class:`~gensim.models.doc2vec.Doc2Vec` it will not. - self.vocabulary : :class:'~gensim.models.doc2vec.Doc2VecVocab' + vocabulary : :class:'~gensim.models.doc2vec.Doc2VecVocab' This object represents the vocabulary (sometimes called Dictionary in gensim) of the model. Besides keeping track of all unique words, this object provides extra functionality, such as sorting words by frequency, or discarding extremely rare words. - self.trainables : :class:`~gensim.models.doc2vec.Doc2VecTrainables` + trainables : :class:`~gensim.models.doc2vec.Doc2VecTrainables` This object represents the inner shallow neural network used to train the embeddings. The semantics of the network differ slightly in the two available training modes (CBOW or SG) but you can think of it as a NN with a single projection and hidden layer which we train on the corpus. The weights are then used as our embeddings - The only addition to the underlying NN used in Word2Vec is that the input includes not only the word vectors - of each word in the context, but also the paragraph vector. + The only addition to the underlying NN used in :class:`~gensim.models.word2vec.Word2Vec` is that the input + includes not only the word vectors of each word in the context, but also the paragraph vector. """ - def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(), **kwargs): - """Initialize the model from an iterable of `documents`. Each document is a - :class:`~gensim.models.doc2vec.TaggedDocument` object that will be used for training. 
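# A short sketch of querying the attributes documented above once training has
# finished; `model` is assumed to be an already-trained Doc2Vec instance, and
# the word and tag are hypothetical:
>>> word_vector = model.wv['leaves']  # word embedding, shared with Word2Vec
>>> doc_vector = model.docvecs[0]     # paragraph vector for the document tagged 0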
+ """ Parameters ---------- documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional - Can be simply a list of elements, but for larger corpora,consider an iterable that streams + Input corpus, can be simply a list of elements, but for larger corpora,consider an iterable that streams the documents directly from disk/network. If you don't supply `documents`, the model is left uninitialized -- use if you plan to initialize it in some other way. dm : {1,0}, optional @@ -446,8 +447,9 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 Seed for the random number generator. Initial vectors for each word are seeded with a hash of the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run, you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter - from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires - use of the `PYTHONHASHSEED` environment variable to control hash randomization). + from OS thread scheduling. + In Python 3, reproducibility between interpreter launches also requires use of the `PYTHONHASHSEED` + environment variable to control hash randomization. min_count : int, optional Ignores all words with total frequency lower than this. max_vocab_size : int, optional @@ -488,17 +490,18 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The input parameters are of the following types: - - word: str. The word we are examining - - count: int. The word's occurence count in the corpus - - min_count: int. The minimum count threshold. - Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part + The rule, if given, is only used to prune vocabulary during current method call and is not stored as part of the model. + + The input parameters are of the following types: + * `word` (str) - the word we are examining + * `count` (int) - the word's frequency count in the corpus + * `min_count` (int) - the minimum count threshold. + callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`, optional List of callbacks that need to be executed/run at specific stages during training. """ - if 'sentences' in kwargs: raise DeprecationWarning( "Parameter 'sentences' was renamed to 'documents', and will be removed in 4.0.0, " @@ -554,19 +557,17 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 @property def dm(self): - """Indicates whether 'distributed memory' (PV-DM) will be used, else `distributed bag of words` + """Indicates whether 'distributed memory' (PV-DM) will be used, else 'distributed bag of words' (PV-DBOW) is used. - Either this or :meth:`~gensim.models.doc2vec.Doc2Vec.dbow` will return True. """ return not self.sg # opposite of SG @property def dbow(self): - """Indicates whether `distributed bag of words` (PV-DBOW) will be used, else 'distributed memory' + """Indicates whether 'distributed bag of words' (PV-DBOW) will be used, else 'distributed memory' (PV-DM) is used. - Either this or :meth:`~gensim.models.doc2vec.Doc2Vec.dm` will return True. 
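# The dm/dbow properties are complementary views of the underlying `sg` flag;
# a quick sketch (constructor arguments are illustrative, model left untrained):
>>> model = Doc2Vec(dm=0)  # request PV-DBOW
>>> bool(model.dbow), bool(model.dm)
(True, False)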
""" return self.sg # same as SG @@ -582,12 +583,12 @@ def clear_sims(self): self.wv.vectors_docs_norm = None def reset_from(self, other_model): - """Copy shareable data structures from another (possibly pretrained) model. + """Copy shareable data structures from another (possibly pre-trained) model. Parameters ---------- other_model : :class:`~gensim.models.doc2vec.Doc2Vec` - Another model whose internal data structures will be copied over to the current object. + Other model whose internal data structures will be copied over to the current object. """ self.wv.vocab = other_model.wv.vocab @@ -1059,18 +1060,18 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No class Doc2VecVocab(Word2VecVocab): """Vocabulary used by :class:`~gensim.models.doc2vec.Doc2Vec`. - This includes a mapping from words found in the corpus to their total occurence count. + This includes a mapping from words found in the corpus to their total frequency count. """ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0): - """Initialize the vocabulary. + """ Parameters ---------- max_vocab_size : int, optional Maximum number of words in the Vocabulary. Used to limit the RAM during vocabulary building; if there are more unique words than this, then prune the infrequent ones. - Every 10 million word types need about 1GB of RAM. Set to `None` for no limit. + Every 10 million word types need about 1GB of RAM, set to `None` for no limit. min_count : int Words with frequency lower than this limit will be discarded form the vocabulary. sample : float, optional @@ -1088,7 +1089,7 @@ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=T sorted_vocab=sorted_vocab, null_word=null_word) def scan_vocab(self, documents, docvecs, progress_per=10000, trim_rule=None): - """Create the models Vocabulary: A mapping from unique words in the corpus to their occurence count. + """Create the models Vocabulary: A mapping from unique words in the corpus to their frequency count. Parameters ---------- @@ -1232,6 +1233,7 @@ def _tag_seen(self, index, docvecs): class Doc2VecTrainables(Word2VecTrainables): + """Represents the inner shallow neural network used to train :class:`~gensim.models.doc2vec.Doc2Vec`.""" def __init__(self, dm=1, dm_concat=0, dm_tag_count=1, vector_size=100, seed=1, hashfxn=hash, window=5): super(Doc2VecTrainables, self).__init__( vector_size=vector_size, seed=seed, hashfxn=hashfxn) From 2d9616ca01e45b5e70015dd30222cdd835059b01 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 18 Apr 2018 17:33:38 +0500 Subject: [PATCH 38/41] fix doc2vec[4] --- gensim/models/doc2vec.py | 125 ++++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 62 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 82462ee0db..56833e9ad7 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -42,7 +42,7 @@ >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True) -* Infer vector for new document +* Infer vector for new document :: >>> vector = model.infer_vector(["system", "response"]) @@ -90,7 +90,7 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, """Update distributed bag of words model ("PV-DBOW") by training on a single document. Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and - :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector()`. + :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector`. 
Notes ----- @@ -157,14 +157,14 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N """Update distributed memory model ("PV-DM") by training on a single document. Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and - :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector()`. This method implements + :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector`. This method implements the DM model with a projection (input) layer that is either the sum or mean of the context vectors, depending on the model's `dm_mean` configuration field. Notes ----- This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from :mod:`~gensim.models.doc2vec_inner` instead. + will use the optimized version from :mod:`gensim.models.doc2vec_inner` instead. Parameters ---------- @@ -245,12 +245,13 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, concatenation of the context window word vectors (rather than a sum or average). This might be slower since the input at each batch will be significantly larger. - Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. + Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and + :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector`. Notes ----- This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from :mod:`~gensim.models.doc2vec_inner` instead. + will use the optimized version from :mod:`gensim.models.doc2vec_inner` instead. Parameters ---------- @@ -386,9 +387,9 @@ def repeat(self, word_count): class Doc2Vec(BaseWordEmbeddingsModel): """Class for training, using and evaluating neural networks described in - `Distributed Representations of Sentences and Documents `_. + `Distributed Representations of Sentences and Documents `_. - Some important attributes are the following: + Some important internal attributes are the following: Attributes ---------- @@ -409,7 +410,7 @@ class Doc2Vec(BaseWordEmbeddingsModel): In a plain :class:`~gensim.models.word2vec.Word2Vec` model the word would have exactly the same representation in both sentences, in :class:`~gensim.models.doc2vec.Doc2Vec` it will not. - vocabulary : :class:'~gensim.models.doc2vec.Doc2VecVocab' + vocabulary : :class:`~gensim.models.doc2vec.Doc2VecVocab` This object represents the vocabulary (sometimes called Dictionary in gensim) of the model. Besides keeping track of all unique words, this object provides extra functionality, such as sorting words by frequency, or discarding extremely rare words. @@ -558,7 +559,7 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 @property def dm(self): """Indicates whether 'distributed memory' (PV-DM) will be used, else 'distributed bag of words' - (PV-DBOW) is used. + (PV-DBOW) is used. """ return not self.sg # opposite of SG @@ -575,6 +576,7 @@ def _set_train_params(self, **kwargs): pass def _clear_post_train(self): + """Alias for :meth:`~gensim.models.doc2vec.Doc2Vec.clear_sims`.""" self.clear_sims() def clear_sims(self): @@ -601,11 +603,11 @@ def reset_from(self, other_model): self.trainables.reset_weights(self.hs, self.negative, self.wv, self.docvecs) def _do_train_job(self, job, alpha, inits): - """ + """Train model using `job` data. Parameters ---------- - job : iterable of list of str + job : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument` The corpus chunk to be used for training this batch. 
alpha : float Learning rate to be used for training this batch. @@ -644,19 +646,18 @@ def _do_train_job(self, job, alpha, inits): def train(self, documents, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, callbacks=()): - """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). - The `documents` iterable can be simply a list of TaggedDocument elements. + """Update the model's neural weights. To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate - progress-percentage logging, either total_examples (count of sentences) or total_words (count of - raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to - :meth:`~gensim.models.word2vec.Word2Vec.build_vocab()`, the count of examples in that corpus - will be available in the model's :attr:`corpus_count` property). + progress-percentage logging, either total_examples (count of sentences) or total_words (count of raw words + in sentences) **MUST** be provided (if the corpus is the same as was provided to + :meth:`~gensim.models.word2vec.Word2Vec.build_vocab`, the count of examples in that corpus will be available + in the model's :attr:`corpus_count` property). - To avoid common mistakes around the model's ability to do multiple training passes itself, an - explicit `epochs` argument **MUST** be provided. In the common and recommended case, - where :meth:`~gensim.models.word2vec.Word2Vec.train()` is only called once, - the model's cached `iter` value should be supplied as `epochs` value. + To avoid common mistakes around the model's ability to do multiple training passes itself, an explicit `epochs` + argument **MUST** be provided. In the common and recommended case, + where :meth:`~gensim.models.word2vec.Word2Vec.train` is only called once, the model's cached `iter` value + should be supplied as `epochs` value. Parameters ---------- @@ -683,6 +684,7 @@ def train(self, documents, total_examples=None, total_words=None, Seconds to wait before reporting progress. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`, optional List of callbacks that need to be executed/run at specific stages during training. + """ super(Doc2Vec, self).train( documents, total_examples=total_examples, total_words=total_words, @@ -694,7 +696,7 @@ def _raw_word_count(self, job): Parameters ---------- - job : iterable of list of str + job : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument` Corpus chunk. Returns @@ -706,12 +708,13 @@ def _raw_word_count(self, job): return sum(len(sentence.words) for sentence in job) def estimated_lookup_memory(self): - """Estimated memory for tag lookup; 0 if using pure int tags. + """Get estimated memory for tag lookup, 0 if using pure int tags. Returns ------- int The estimated RAM required to look up a tag in bytes. + """ return 60 * len(self.docvecs.offset2doctag) + 140 * len(self.docvecs.doctags) @@ -726,15 +729,14 @@ def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): Parameters ---------- doc_words : list of str - A document for which the vector representation will be inferred. Note this does not have to - be already used in training; it can be an completely new document. + A document for which the vector representation will be inferred. alpha : float, optional The initial learning rate. 
min_alpha : float, optional Learning rate will linearly drop to `min_alpha` as training progresses. steps : int, optional - Number of times to train the new document. A higher value may slow down training, but - it will result in more stable representations. + Number of times to train the new document. A higher value may slow down training, but it will result in more + stable representations. Returns ------- @@ -789,12 +791,13 @@ def __getitem__(self, tag): return vstack([self[i] for i in tag]) def __str__(self): - """Abbreviated name reflecting major configuration paramaters. + """Abbreviated name reflecting major configuration parameters. Returns ------- str Human readable representation of the models internal state. + """ segments = [] if self.comment: @@ -834,9 +837,9 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen Parameters ---------- keep_doctags_vectors : bool, optional - Set to False if you don't want to save doctags vectors. In this case you will not be able to - use :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.most_similar`, - :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.similarity` etc. methods. + Set to False if you don't want to save doctags vectors. In this case you will not be able to use + :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.most_similar`, + :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.similarity`, etc methods. keep_inference : bool, optional Set to False if you don't want to store parameters that are used for :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector` method. @@ -856,8 +859,7 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen del self.trainables.vectors_docs_lockf def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False): - """Store the input-hidden weight matrix in the same format used by the original - C word2vec-tool, for compatibility. + """Store the input-hidden weight matrix in the same format used by the original C word2vec-tool. Parameters ---------- @@ -868,12 +870,12 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* word_vec : bool, optional Indicates whether to store word vectors. prefix : str, optional - Uniquely identifies doctags from word vocab, and avoids collision - in case of repeated string in doctag and word vocab. + Uniquely identifies doctags from word vocab, and avoids collision in case of repeated string in doctag + and word vocab. fvocab : str, optional - Optional file path used to save the vocabulary + Optional file path used to save the vocabulary. binary : bool, optional - If True, the data wil be saved in binary word2vec format, else it will be saved in plain text. + If True, the data wil be saved in binary word2vec format, otherwise - will be saved in plain text. """ total_vec = len(self.wv.vocab) + len(self.docvecs) @@ -893,20 +895,13 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* binary=binary, write_first_line=write_first_line) def init_sims(self, replace=False): - """Precompute L2-normalized vectors. + """Pre-compute L2-normalized vectors. Parameters ---------- replace : bool - If set, forget the original vectors and only keep the normalized ones to saved RAM. - - Notes - ----- - You **cannot continue training or inference** after doing a replace. 
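# A sketch of the trade-off described in these notes; `model` is assumed to be
# an already-trained Doc2Vec instance:
>>> model.init_sims(replace=True)          # frees the non-normalized vectors
>>> sims = model.docvecs.most_similar(0)   # similarity queries still work
>>> # but model.train(...) / model.infer_vector(...) are no longer valid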
- The model becomes effectively read-only - you can call - :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.most_similar`, - :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.similarity` etc., but not - :meth:`~gensim.models.doc2vec.Doc2Vec.train` or :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector`. + If True - forget the original vectors and only keep the normalized ones to saved RAM (also you can't + continue training if call it with `replace=True`). """ self.docvecs.init_sims(replace=replace) @@ -914,17 +909,26 @@ def init_sims(self, replace=False): @classmethod def load(cls, *args, **kwargs): """Loads a previously saved :class:`~gensim.models.doc2vec.Doc2Vec` model. - Also see :meth:`~gensim.models.doc2vec.Doc2Vec.save`. Parameters ---------- fname : str Path to the saved file. + *args : object + Additional arguments, see `~gensim.models.base_any2vec.BaseWordEmbeddingsModel.load`. + **kwargs : object + Additional arguments, see `~gensim.models.base_any2vec.BaseWordEmbeddingsModel.load`. + + See Also + -------- + :meth:`~gensim.models.doc2vec.Doc2Vec.save` + Save :class:`~gensim.models.doc2vec.Doc2Vec` model. Returns ------- :class:`~gensim.models.doc2vec.Doc2Vec` Loaded model. + """ try: return super(Doc2Vec, cls).load(*args, **kwargs) @@ -938,7 +942,7 @@ def estimate_memory(self, vocab_size=None, report=None): Parameters ---------- - vocab_size : int + vocab_size : int, optional Number of raw words in the vocabulary. report : dict of (str, int), optional A dictionary from string representations of the **specific** model's memory consuming members @@ -977,14 +981,17 @@ def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_voca Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), or a callable that accepts parameters (word, count, min_count) and returns either :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The input parameters are of the following types: - * word: str. The word we are examining. - * count: int. The word's occurence count in the corpus. - * min_count: int. The minimum count threshold. - Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part + The rule, if given, is only used to prune vocabulary during current method call and is not stored as part of the model. + + The input parameters are of the following types: + * `word` (str) - the word we are examining + * `count` (int) - the word's frequency count in the corpus + * `min_count` (int) - the minimum count threshold. + **kwargs Additional key word arguments passed to the internal vocabulary construction. + """ total_words, corpus_count = self.vocabulary.scan_vocab( documents, self.docvecs, progress_per=progress_per, trim_rule=trim_rule) @@ -1006,7 +1013,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No Parameters ---------- word_freq : dict of (str, int) - Word count mapping. + Word <-> count mapping. keep_raw_vocab : bool, optional If not true, delete the raw vocabulary after the scaling is done and free up RAM. corpus_count : int, optional @@ -1028,12 +1035,6 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No update : bool, optional If true, the new provided words in `word_freq` dict will be added to model's vocab. 
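# A usage sketch for build_vocab_from_freq on a Doc2Vec instance; the frequency
# counts are hypothetical and the model is assumed freshly constructed:
>>> from gensim.models.doc2vec import Doc2Vec
>>> model = Doc2Vec()
>>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20})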
- Examples - -------- - >>> from gensim.models.word2vec import Word2Vec - >>> - >>> model= Word2Vec() - >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20}) """ logger.info("Processing provided word frequencies") # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) From 2fcd2f1663a741d3cc947f2f2e1aa42d116de388 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 18 Apr 2018 18:11:35 +0500 Subject: [PATCH 39/41] fix doc2vec_inner + remove unused imports --- docs/src/apiref.rst | 1 + docs/src/models/doc2vec_inner.rst | 9 + gensim/models/doc2vec_inner.c | 2298 ++++++++++++++--------------- gensim/models/doc2vec_inner.pyx | 90 +- 4 files changed, 1184 insertions(+), 1214 deletions(-) create mode 100644 docs/src/models/doc2vec_inner.rst diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index 1c968c7cd7..ffb19b9c5e 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -45,6 +45,7 @@ Modules: models/word2vec models/keyedvectors models/doc2vec + models/doc2vec_inner models/fasttext models/phrases models/poincare diff --git a/docs/src/models/doc2vec_inner.rst b/docs/src/models/doc2vec_inner.rst new file mode 100644 index 0000000000..1f4ff1d5a0 --- /dev/null +++ b/docs/src/models/doc2vec_inner.rst @@ -0,0 +1,9 @@ +:mod:`models.doc2vec_inner` -- Cython job for training Doc2Vec model +==================================================================== + +.. automodule:: gensim.models.doc2vec_inner + :synopsis: Cython job for training Doc2Vec model + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/gensim/models/doc2vec_inner.c b/gensim/models/doc2vec_inner.c index 103f34da81..3df79dfaee 100644 --- a/gensim/models/doc2vec_inner.c +++ b/gensim/models/doc2vec_inner.c @@ -524,7 +524,6 @@ static CYTHON_INLINE float __PYX_NAN() { #include #include "numpy/arrayobject.h" #include "numpy/ufuncobject.h" -#include #include "voidptr.h" #ifdef _OPENMP #include @@ -768,7 +767,7 @@ static const char *__pyx_f[] = { #endif -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":743 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":743 * # in Cython to enable them only on the right systems. 
* * ctypedef npy_int8 int8_t # <<<<<<<<<<<<<< @@ -777,7 +776,7 @@ static const char *__pyx_f[] = { */ typedef npy_int8 __pyx_t_5numpy_int8_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":744 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":744 * * ctypedef npy_int8 int8_t * ctypedef npy_int16 int16_t # <<<<<<<<<<<<<< @@ -786,7 +785,7 @@ typedef npy_int8 __pyx_t_5numpy_int8_t; */ typedef npy_int16 __pyx_t_5numpy_int16_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":745 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":745 * ctypedef npy_int8 int8_t * ctypedef npy_int16 int16_t * ctypedef npy_int32 int32_t # <<<<<<<<<<<<<< @@ -795,7 +794,7 @@ typedef npy_int16 __pyx_t_5numpy_int16_t; */ typedef npy_int32 __pyx_t_5numpy_int32_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":746 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":746 * ctypedef npy_int16 int16_t * ctypedef npy_int32 int32_t * ctypedef npy_int64 int64_t # <<<<<<<<<<<<<< @@ -804,7 +803,7 @@ typedef npy_int32 __pyx_t_5numpy_int32_t; */ typedef npy_int64 __pyx_t_5numpy_int64_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":750 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":750 * #ctypedef npy_int128 int128_t * * ctypedef npy_uint8 uint8_t # <<<<<<<<<<<<<< @@ -813,7 +812,7 @@ typedef npy_int64 __pyx_t_5numpy_int64_t; */ typedef npy_uint8 __pyx_t_5numpy_uint8_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":751 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":751 * * ctypedef npy_uint8 uint8_t * ctypedef npy_uint16 uint16_t # <<<<<<<<<<<<<< @@ -822,7 +821,7 @@ typedef npy_uint8 __pyx_t_5numpy_uint8_t; */ typedef npy_uint16 __pyx_t_5numpy_uint16_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":752 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":752 * ctypedef npy_uint8 uint8_t * ctypedef npy_uint16 uint16_t * ctypedef npy_uint32 uint32_t # <<<<<<<<<<<<<< @@ -831,7 +830,7 @@ typedef npy_uint16 __pyx_t_5numpy_uint16_t; */ typedef npy_uint32 __pyx_t_5numpy_uint32_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":753 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":753 * ctypedef npy_uint16 uint16_t * ctypedef npy_uint32 uint32_t * ctypedef npy_uint64 uint64_t # <<<<<<<<<<<<<< @@ -840,7 +839,7 @@ typedef npy_uint32 __pyx_t_5numpy_uint32_t; */ typedef npy_uint64 __pyx_t_5numpy_uint64_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":757 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":757 * #ctypedef npy_uint128 uint128_t * * ctypedef npy_float32 float32_t # <<<<<<<<<<<<<< @@ -849,7 +848,7 @@ typedef npy_uint64 __pyx_t_5numpy_uint64_t; */ typedef npy_float32 __pyx_t_5numpy_float32_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":758 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":758 * * ctypedef npy_float32 float32_t * ctypedef npy_float64 float64_t # <<<<<<<<<<<<<< @@ -858,7 +857,7 @@ typedef npy_float32 __pyx_t_5numpy_float32_t; */ typedef npy_float64 
__pyx_t_5numpy_float64_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":767 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":767 * # The int types are mapped a bit surprising -- * # numpy.int corresponds to 'l' and numpy.long to 'q' * ctypedef npy_long int_t # <<<<<<<<<<<<<< @@ -867,7 +866,7 @@ typedef npy_float64 __pyx_t_5numpy_float64_t; */ typedef npy_long __pyx_t_5numpy_int_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":768 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":768 * # numpy.int corresponds to 'l' and numpy.long to 'q' * ctypedef npy_long int_t * ctypedef npy_longlong long_t # <<<<<<<<<<<<<< @@ -876,7 +875,7 @@ typedef npy_long __pyx_t_5numpy_int_t; */ typedef npy_longlong __pyx_t_5numpy_long_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":769 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":769 * ctypedef npy_long int_t * ctypedef npy_longlong long_t * ctypedef npy_longlong longlong_t # <<<<<<<<<<<<<< @@ -885,7 +884,7 @@ typedef npy_longlong __pyx_t_5numpy_long_t; */ typedef npy_longlong __pyx_t_5numpy_longlong_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":771 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":771 * ctypedef npy_longlong longlong_t * * ctypedef npy_ulong uint_t # <<<<<<<<<<<<<< @@ -894,7 +893,7 @@ typedef npy_longlong __pyx_t_5numpy_longlong_t; */ typedef npy_ulong __pyx_t_5numpy_uint_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":772 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":772 * * ctypedef npy_ulong uint_t * ctypedef npy_ulonglong ulong_t # <<<<<<<<<<<<<< @@ -903,7 +902,7 @@ typedef npy_ulong __pyx_t_5numpy_uint_t; */ typedef npy_ulonglong __pyx_t_5numpy_ulong_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":773 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":773 * ctypedef npy_ulong uint_t * ctypedef npy_ulonglong ulong_t * ctypedef npy_ulonglong ulonglong_t # <<<<<<<<<<<<<< @@ -912,7 +911,7 @@ typedef npy_ulonglong __pyx_t_5numpy_ulong_t; */ typedef npy_ulonglong __pyx_t_5numpy_ulonglong_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":775 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":775 * ctypedef npy_ulonglong ulonglong_t * * ctypedef npy_intp intp_t # <<<<<<<<<<<<<< @@ -921,7 +920,7 @@ typedef npy_ulonglong __pyx_t_5numpy_ulonglong_t; */ typedef npy_intp __pyx_t_5numpy_intp_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":776 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":776 * * ctypedef npy_intp intp_t * ctypedef npy_uintp uintp_t # <<<<<<<<<<<<<< @@ -930,7 +929,7 @@ typedef npy_intp __pyx_t_5numpy_intp_t; */ typedef npy_uintp __pyx_t_5numpy_uintp_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":778 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":778 * ctypedef npy_uintp uintp_t * * ctypedef npy_double float_t # <<<<<<<<<<<<<< @@ -939,7 +938,7 @@ typedef npy_uintp __pyx_t_5numpy_uintp_t; */ typedef npy_double __pyx_t_5numpy_float_t; -/* 
"../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":779 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":779 * * ctypedef npy_double float_t * ctypedef npy_double double_t # <<<<<<<<<<<<<< @@ -948,7 +947,7 @@ typedef npy_double __pyx_t_5numpy_float_t; */ typedef npy_double __pyx_t_5numpy_double_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":780 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":780 * ctypedef npy_double float_t * ctypedef npy_double double_t * ctypedef npy_longdouble longdouble_t # <<<<<<<<<<<<<< @@ -992,7 +991,7 @@ static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(do /*--- Type declarations ---*/ -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":782 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":782 * ctypedef npy_longdouble longdouble_t * * ctypedef npy_cfloat cfloat_t # <<<<<<<<<<<<<< @@ -1001,7 +1000,7 @@ static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(do */ typedef npy_cfloat __pyx_t_5numpy_cfloat_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":783 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":783 * * ctypedef npy_cfloat cfloat_t * ctypedef npy_cdouble cdouble_t # <<<<<<<<<<<<<< @@ -1010,7 +1009,7 @@ typedef npy_cfloat __pyx_t_5numpy_cfloat_t; */ typedef npy_cdouble __pyx_t_5numpy_cdouble_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":784 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":784 * ctypedef npy_cfloat cfloat_t * ctypedef npy_cdouble cdouble_t * ctypedef npy_clongdouble clongdouble_t # <<<<<<<<<<<<<< @@ -1019,7 +1018,7 @@ typedef npy_cdouble __pyx_t_5numpy_cdouble_t; */ typedef npy_clongdouble __pyx_t_5numpy_clongdouble_t; -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":786 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":786 * ctypedef npy_clongdouble clongdouble_t * * ctypedef npy_cdouble complex_t # <<<<<<<<<<<<<< @@ -1578,8 +1577,6 @@ static PyTypeObject *__pyx_ptype_5numpy_ndarray = 0; static PyTypeObject *__pyx_ptype_5numpy_ufunc = 0; static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *, char *, char *, int *); /*proto*/ -/* Module declarations from 'libc.math' */ - /* Module declarations from 'gensim.models.word2vec_inner' */ static __pyx_t_6gensim_6models_14word2vec_inner_scopy_ptr *__pyx_vp_6gensim_6models_14word2vec_inner_scopy = 0; #define __pyx_v_6gensim_6models_14word2vec_inner_scopy (*__pyx_vp_6gensim_6models_14word2vec_inner_scopy) @@ -1599,10 +1596,6 @@ static __pyx_t_6gensim_6models_14word2vec_inner_our_dot_ptr *__pyx_vp_6gensim_6m #define __pyx_v_6gensim_6models_14word2vec_inner_our_dot (*__pyx_vp_6gensim_6models_14word2vec_inner_our_dot) static __pyx_t_6gensim_6models_14word2vec_inner_our_saxpy_ptr *__pyx_vp_6gensim_6models_14word2vec_inner_our_saxpy = 0; #define __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy (*__pyx_vp_6gensim_6models_14word2vec_inner_our_saxpy) -static __pyx_t_6gensim_6models_14word2vec_inner_REAL_t (*__pyx_f_6gensim_6models_14word2vec_inner_our_dot_double)(int const *, float const *, int const *, float const *, int const *); /*proto*/ -static __pyx_t_6gensim_6models_14word2vec_inner_REAL_t 
(*__pyx_f_6gensim_6models_14word2vec_inner_our_dot_float)(int const *, float const *, int const *, float const *, int const *); /*proto*/ -static __pyx_t_6gensim_6models_14word2vec_inner_REAL_t (*__pyx_f_6gensim_6models_14word2vec_inner_our_dot_noblas)(int const *, float const *, int const *, float const *, int const *); /*proto*/ -static void (*__pyx_f_6gensim_6models_14word2vec_inner_our_saxpy_noblas)(int const *, float const *, float const *, int const *, float *, int const *); /*proto*/ static unsigned PY_LONG_LONG (*__pyx_f_6gensim_6models_14word2vec_inner_bisect_left)(__pyx_t_5numpy_uint32_t *, unsigned PY_LONG_LONG, unsigned PY_LONG_LONG, unsigned PY_LONG_LONG); /*proto*/ static unsigned PY_LONG_LONG (*__pyx_f_6gensim_6models_14word2vec_inner_random_int32)(unsigned PY_LONG_LONG *); /*proto*/ @@ -1676,7 +1669,6 @@ static const char __pyx_k_vectors[] = "vectors"; static const char __pyx_k_vlookup[] = "vlookup"; static const char __pyx_k_codelens[] = "codelens"; static const char __pyx_k_negative[] = "negative"; -static const char __pyx_k_word2vec[] = "word2vec"; static const char __pyx_k_cbow_mean[] = "cbow_mean"; static const char __pyx_k_cum_table[] = "cum_table"; static const char __pyx_k_doc_words[] = "doc_words"; @@ -1694,7 +1686,6 @@ static const char __pyx_k_learn_words[] = "learn_words"; static const char __pyx_k_next_random[] = "next_random"; static const char __pyx_k_train_words[] = "train_words"; static const char __pyx_k_vector_size[] = "vector_size"; -static const char __pyx_k_FAST_VERSION[] = "FAST_VERSION"; static const char __pyx_k_RuntimeError[] = "RuntimeError"; static const char __pyx_k_dm_tag_count[] = "dm_tag_count"; static const char __pyx_k_doctag_locks[] = "doctag_locks"; @@ -1734,10 +1725,10 @@ static const char __pyx_k_numpy_core_multiarray_failed_to[] = "numpy.core.multia static const char __pyx_k_unknown_dtype_code_in_numpy_pxd[] = "unknown dtype code in numpy.pxd (%d)"; static const char __pyx_k_Format_string_allocated_too_shor[] = "Format string allocated too short, see comment in numpy.pxd"; static const char __pyx_k_Non_native_byte_order_not_suppor[] = "Non-native byte order not supported"; +static const char __pyx_k_Optimized_cython_functions_for_t[] = "Optimized cython functions for training :class:`~gensim.models.doc2vec.Doc2Vec` model."; static const char __pyx_k_ndarray_is_not_Fortran_contiguou[] = "ndarray is not Fortran contiguous"; static const char __pyx_k_numpy_core_umath_failed_to_impor[] = "numpy.core.umath failed to import"; static const char __pyx_k_Format_string_allocated_too_shor_2[] = "Format string allocated too short."; -static PyObject *__pyx_n_s_FAST_VERSION; static PyObject *__pyx_kp_u_Format_string_allocated_too_shor; static PyObject *__pyx_kp_u_Format_string_allocated_too_shor_2; static PyObject *__pyx_n_s_ImportError; @@ -1840,7 +1831,6 @@ static PyObject *__pyx_n_s_vocab; static PyObject *__pyx_n_s_vocabulary; static PyObject *__pyx_n_s_window; static PyObject *__pyx_n_s_window_indexes; -static PyObject *__pyx_n_s_word2vec; static PyObject *__pyx_n_s_word_locks; static PyObject *__pyx_n_s_word_locks_2; static PyObject *__pyx_n_s_word_vectors; @@ -1878,7 +1868,7 @@ static PyObject *__pyx_codeobj__19; static PyObject *__pyx_codeobj__21; static PyObject *__pyx_codeobj__23; -/* "gensim/models/doc2vec_inner.pyx":41 +/* "gensim/models/doc2vec_inner.pyx":35 * DEF MAX_EXP = 6 * * cdef void fast_document_dbow_hs( # <<<<<<<<<<<<<< @@ -1897,7 +1887,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ int 
__pyx_t_3; int __pyx_t_4; - /* "gensim/models/doc2vec_inner.pyx":48 + /* "gensim/models/doc2vec_inner.pyx":42 * * cdef long long a, b * cdef long long row1 = context_index * size, row2 # <<<<<<<<<<<<<< @@ -1906,7 +1896,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ */ __pyx_v_row1 = (__pyx_v_context_index * __pyx_v_size); - /* "gensim/models/doc2vec_inner.pyx":51 + /* "gensim/models/doc2vec_inner.pyx":45 * cdef REAL_t f, g * * memset(work, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<< @@ -1915,7 +1905,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ */ memset(__pyx_v_work, 0, (__pyx_v_size * (sizeof(__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)))); - /* "gensim/models/doc2vec_inner.pyx":52 + /* "gensim/models/doc2vec_inner.pyx":46 * * memset(work, 0, size * cython.sizeof(REAL_t)) * for b in range(codelen): # <<<<<<<<<<<<<< @@ -1926,7 +1916,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) { __pyx_v_b = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":53 + /* "gensim/models/doc2vec_inner.pyx":47 * memset(work, 0, size * cython.sizeof(REAL_t)) * for b in range(codelen): * row2 = word_point[b] * size # <<<<<<<<<<<<<< @@ -1935,7 +1925,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ */ __pyx_v_row2 = ((__pyx_v_word_point[__pyx_v_b]) * __pyx_v_size); - /* "gensim/models/doc2vec_inner.pyx":54 + /* "gensim/models/doc2vec_inner.pyx":48 * for b in range(codelen): * row2 = word_point[b] * size * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<< @@ -1944,7 +1934,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ */ __pyx_v_f = __pyx_v_6gensim_6models_14word2vec_inner_our_dot((&__pyx_v_size), (&(__pyx_v_context_vectors[__pyx_v_row1])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":55 + /* "gensim/models/doc2vec_inner.pyx":49 * row2 = word_point[b] * size * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<< @@ -1962,7 +1952,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ __pyx_L6_bool_binop_done:; if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":56 + /* "gensim/models/doc2vec_inner.pyx":50 * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: * continue # <<<<<<<<<<<<<< @@ -1971,7 +1961,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ */ goto __pyx_L3_continue; - /* "gensim/models/doc2vec_inner.pyx":55 + /* "gensim/models/doc2vec_inner.pyx":49 * row2 = word_point[b] * size * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<< @@ -1980,7 +1970,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ */ } - /* "gensim/models/doc2vec_inner.pyx":57 + /* "gensim/models/doc2vec_inner.pyx":51 * if f <= -MAX_EXP or f >= MAX_EXP: * continue * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<< @@ -1989,7 +1979,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ */ __pyx_v_f = (__pyx_v_6gensim_6models_14word2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 
83.0))]); - /* "gensim/models/doc2vec_inner.pyx":58 + /* "gensim/models/doc2vec_inner.pyx":52 * continue * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] * g = (1 - word_code[b] - f) * alpha # <<<<<<<<<<<<<< @@ -1998,7 +1988,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ */ __pyx_v_g = (((1 - (__pyx_v_word_code[__pyx_v_b])) - __pyx_v_f) * __pyx_v_alpha); - /* "gensim/models/doc2vec_inner.pyx":59 + /* "gensim/models/doc2vec_inner.pyx":53 * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] * g = (1 - word_code[b] - f) * alpha * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<< @@ -2007,7 +1997,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ */ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":60 + /* "gensim/models/doc2vec_inner.pyx":54 * g = (1 - word_code[b] - f) * alpha * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) * if learn_hidden: # <<<<<<<<<<<<<< @@ -2017,7 +2007,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ __pyx_t_3 = (__pyx_v_learn_hidden != 0); if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":61 + /* "gensim/models/doc2vec_inner.pyx":55 * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) * if learn_hidden: * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<< @@ -2026,7 +2016,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ */ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_context_vectors[__pyx_v_row1])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":60 + /* "gensim/models/doc2vec_inner.pyx":54 * g = (1 - word_code[b] - f) * alpha * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) * if learn_hidden: # <<<<<<<<<<<<<< @@ -2037,7 +2027,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ __pyx_L3_continue:; } - /* "gensim/models/doc2vec_inner.pyx":62 + /* "gensim/models/doc2vec_inner.pyx":56 * if learn_hidden: * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1[row2], &ONE) * if learn_context: # <<<<<<<<<<<<<< @@ -2047,7 +2037,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ __pyx_t_3 = (__pyx_v_learn_context != 0); if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":63 + /* "gensim/models/doc2vec_inner.pyx":57 * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1[row2], &ONE) * if learn_context: * our_saxpy(&size, &context_locks[context_index], work, &ONE, &context_vectors[row1], &ONE) # <<<<<<<<<<<<<< @@ -2056,7 +2046,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ */ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_size), (&(__pyx_v_context_locks[__pyx_v_context_index])), __pyx_v_work, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_context_vectors[__pyx_v_row1])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":62 + /* "gensim/models/doc2vec_inner.pyx":56 * if learn_hidden: * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1[row2], &ONE) * if learn_context: # 
<<<<<<<<<<<<<< @@ -2065,7 +2055,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ */ } - /* "gensim/models/doc2vec_inner.pyx":41 + /* "gensim/models/doc2vec_inner.pyx":35 * DEF MAX_EXP = 6 * * cdef void fast_document_dbow_hs( # <<<<<<<<<<<<<< @@ -2076,7 +2066,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_ /* function exit code */ } -/* "gensim/models/doc2vec_inner.pyx":66 +/* "gensim/models/doc2vec_inner.pyx":60 * * * cdef unsigned long long fast_document_dbow_neg( # <<<<<<<<<<<<<< @@ -2099,7 +2089,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume int __pyx_t_3; int __pyx_t_4; - /* "gensim/models/doc2vec_inner.pyx":73 + /* "gensim/models/doc2vec_inner.pyx":67 * * cdef long long a * cdef long long row1 = context_index * size, row2 # <<<<<<<<<<<<<< @@ -2108,7 +2098,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_row1 = (__pyx_v_context_index * __pyx_v_size); - /* "gensim/models/doc2vec_inner.pyx":74 + /* "gensim/models/doc2vec_inner.pyx":68 * cdef long long a * cdef long long row1 = context_index * size, row2 * cdef unsigned long long modulo = 281474976710655ULL # <<<<<<<<<<<<<< @@ -2117,7 +2107,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_modulo = 281474976710655ULL; - /* "gensim/models/doc2vec_inner.pyx":79 + /* "gensim/models/doc2vec_inner.pyx":73 * cdef int d * * memset(work, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<< @@ -2126,7 +2116,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ memset(__pyx_v_work, 0, (__pyx_v_size * (sizeof(__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)))); - /* "gensim/models/doc2vec_inner.pyx":81 + /* "gensim/models/doc2vec_inner.pyx":75 * memset(work, 0, size * cython.sizeof(REAL_t)) * * for d in range(negative+1): # <<<<<<<<<<<<<< @@ -2137,7 +2127,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) { __pyx_v_d = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":82 + /* "gensim/models/doc2vec_inner.pyx":76 * * for d in range(negative+1): * if d == 0: # <<<<<<<<<<<<<< @@ -2147,7 +2137,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_t_3 = ((__pyx_v_d == 0) != 0); if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":83 + /* "gensim/models/doc2vec_inner.pyx":77 * for d in range(negative+1): * if d == 0: * target_index = word_index # <<<<<<<<<<<<<< @@ -2156,7 +2146,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_target_index = __pyx_v_word_index; - /* "gensim/models/doc2vec_inner.pyx":84 + /* "gensim/models/doc2vec_inner.pyx":78 * if d == 0: * target_index = word_index * label = ONEF # <<<<<<<<<<<<<< @@ -2165,7 +2155,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_label = __pyx_v_6gensim_6models_13doc2vec_inner_ONEF; - /* "gensim/models/doc2vec_inner.pyx":82 + /* "gensim/models/doc2vec_inner.pyx":76 * * for d in range(negative+1): * if d == 0: # <<<<<<<<<<<<<< @@ -2175,7 +2165,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume goto __pyx_L5; } - /* "gensim/models/doc2vec_inner.pyx":86 + /* "gensim/models/doc2vec_inner.pyx":80 * label = ONEF * else: * target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, 
cum_table_len) # <<<<<<<<<<<<<< @@ -2185,7 +2175,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume /*else*/ { __pyx_v_target_index = __pyx_f_6gensim_6models_14word2vec_inner_bisect_left(__pyx_v_cum_table, ((__pyx_v_next_random >> 16) % (__pyx_v_cum_table[(__pyx_v_cum_table_len - 1)])), 0, __pyx_v_cum_table_len); - /* "gensim/models/doc2vec_inner.pyx":87 + /* "gensim/models/doc2vec_inner.pyx":81 * else: * target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len) * next_random = (next_random * 25214903917ULL + 11) & modulo # <<<<<<<<<<<<<< @@ -2194,7 +2184,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_next_random = (((__pyx_v_next_random * ((unsigned PY_LONG_LONG)25214903917ULL)) + 11) & __pyx_v_modulo); - /* "gensim/models/doc2vec_inner.pyx":88 + /* "gensim/models/doc2vec_inner.pyx":82 * target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len) * next_random = (next_random * 25214903917ULL + 11) & modulo * if target_index == word_index: # <<<<<<<<<<<<<< @@ -2204,7 +2194,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_t_3 = ((__pyx_v_target_index == __pyx_v_word_index) != 0); if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":89 + /* "gensim/models/doc2vec_inner.pyx":83 * next_random = (next_random * 25214903917ULL + 11) & modulo * if target_index == word_index: * continue # <<<<<<<<<<<<<< @@ -2213,7 +2203,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ goto __pyx_L3_continue; - /* "gensim/models/doc2vec_inner.pyx":88 + /* "gensim/models/doc2vec_inner.pyx":82 * target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len) * next_random = (next_random * 25214903917ULL + 11) & modulo * if target_index == word_index: # <<<<<<<<<<<<<< @@ -2222,7 +2212,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ } - /* "gensim/models/doc2vec_inner.pyx":90 + /* "gensim/models/doc2vec_inner.pyx":84 * if target_index == word_index: * continue * label = 0.0 # <<<<<<<<<<<<<< @@ -2233,7 +2223,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume } __pyx_L5:; - /* "gensim/models/doc2vec_inner.pyx":91 + /* "gensim/models/doc2vec_inner.pyx":85 * continue * label = 0.0 * row2 = target_index * size # <<<<<<<<<<<<<< @@ -2242,7 +2232,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_row2 = (__pyx_v_target_index * __pyx_v_size); - /* "gensim/models/doc2vec_inner.pyx":92 + /* "gensim/models/doc2vec_inner.pyx":86 * label = 0.0 * row2 = target_index * size * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<< @@ -2251,7 +2241,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_f = __pyx_v_6gensim_6models_14word2vec_inner_our_dot((&__pyx_v_size), (&(__pyx_v_context_vectors[__pyx_v_row1])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":93 + /* "gensim/models/doc2vec_inner.pyx":87 * row2 = target_index * size * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<< @@ -2269,7 +2259,7 @@ static unsigned PY_LONG_LONG 
__pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_L8_bool_binop_done:; if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":94 + /* "gensim/models/doc2vec_inner.pyx":88 * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: * continue # <<<<<<<<<<<<<< @@ -2278,7 +2268,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ goto __pyx_L3_continue; - /* "gensim/models/doc2vec_inner.pyx":93 + /* "gensim/models/doc2vec_inner.pyx":87 * row2 = target_index * size * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<< @@ -2287,7 +2277,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ } - /* "gensim/models/doc2vec_inner.pyx":95 + /* "gensim/models/doc2vec_inner.pyx":89 * if f <= -MAX_EXP or f >= MAX_EXP: * continue * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<< @@ -2296,7 +2286,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_f = (__pyx_v_6gensim_6models_14word2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 83.0))]); - /* "gensim/models/doc2vec_inner.pyx":96 + /* "gensim/models/doc2vec_inner.pyx":90 * continue * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] * g = (label - f) * alpha # <<<<<<<<<<<<<< @@ -2305,7 +2295,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_g = ((__pyx_v_label - __pyx_v_f) * __pyx_v_alpha); - /* "gensim/models/doc2vec_inner.pyx":97 + /* "gensim/models/doc2vec_inner.pyx":91 * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] * g = (label - f) * alpha * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<< @@ -2314,7 +2304,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":98 + /* "gensim/models/doc2vec_inner.pyx":92 * g = (label - f) * alpha * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) * if learn_hidden: # <<<<<<<<<<<<<< @@ -2324,7 +2314,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_t_3 = (__pyx_v_learn_hidden != 0); if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":99 + /* "gensim/models/doc2vec_inner.pyx":93 * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) * if learn_hidden: * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<< @@ -2333,7 +2323,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_context_vectors[__pyx_v_row1])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":98 + /* "gensim/models/doc2vec_inner.pyx":92 * g = (label - f) * alpha * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) * if learn_hidden: # <<<<<<<<<<<<<< @@ -2344,7 +2334,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_L3_continue:; } - /* "gensim/models/doc2vec_inner.pyx":100 + /* 
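
fast_document_dbow_neg, whose renumbered hunks continue below, pairs the true word (label 1) with `negative` noise words drawn from cum_table via the same 48-bit linear congruential generator as the original word2vec.c (constants 25214903917 and 11, state masked to 2**48 - 1, exactly as in the hunks above). A hedged Python sketch of just the sampling loop; names are illustrative, and cum_table stands in for gensim's uint32 array of cumulative, smoothed word counts:

    from bisect import bisect_left

    MODULO = 0xFFFFFFFFFFFF        # 281474976710655, i.e. 2**48 - 1

    def draw_negatives(cum_table, word_index, negative, next_random):
        """Return ([(target_index, label), ...], updated rng state)."""
        pairs = [(word_index, 1.0)]                      # d == 0: the true word
        for _ in range(negative):
            r = (next_random >> 16) % cum_table[-1]
            target = bisect_left(cum_table, r, 0, len(cum_table))
            next_random = (next_random * 25214903917 + 11) & MODULO
            if target != word_index:                     # never reuse the true word as noise
                pairs.append((target, 0.0))
        return pairs, next_random

Each (target, label) pair then feeds the same table-lookup sigmoid and a g = (label - f) * alpha update, as the hunks below show.
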
"gensim/models/doc2vec_inner.pyx":94 * if learn_hidden: * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE) * if learn_context: # <<<<<<<<<<<<<< @@ -2354,7 +2344,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_t_3 = (__pyx_v_learn_context != 0); if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":101 + /* "gensim/models/doc2vec_inner.pyx":95 * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE) * if learn_context: * our_saxpy(&size, &context_locks[context_index], work, &ONE, &context_vectors[row1], &ONE) # <<<<<<<<<<<<<< @@ -2363,7 +2353,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_size), (&(__pyx_v_context_locks[__pyx_v_context_index])), __pyx_v_work, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_context_vectors[__pyx_v_row1])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":100 + /* "gensim/models/doc2vec_inner.pyx":94 * if learn_hidden: * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE) * if learn_context: # <<<<<<<<<<<<<< @@ -2372,7 +2362,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ } - /* "gensim/models/doc2vec_inner.pyx":103 + /* "gensim/models/doc2vec_inner.pyx":97 * our_saxpy(&size, &context_locks[context_index], work, &ONE, &context_vectors[row1], &ONE) * * return next_random # <<<<<<<<<<<<<< @@ -2382,7 +2372,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_r = __pyx_v_next_random; goto __pyx_L0; - /* "gensim/models/doc2vec_inner.pyx":66 + /* "gensim/models/doc2vec_inner.pyx":60 * * * cdef unsigned long long fast_document_dbow_neg( # <<<<<<<<<<<<<< @@ -2395,7 +2385,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume return __pyx_r; } -/* "gensim/models/doc2vec_inner.pyx":106 +/* "gensim/models/doc2vec_inner.pyx":100 * * * cdef void fast_document_dm_hs( # <<<<<<<<<<<<<< @@ -2413,7 +2403,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs(__pyx_t_ int __pyx_t_3; int __pyx_t_4; - /* "gensim/models/doc2vec_inner.pyx":117 + /* "gensim/models/doc2vec_inner.pyx":111 * # l1 already composed by caller, passed in as neu1 * # work (also passed in) will accumulate l1 error * for b in range(word_code_len): # <<<<<<<<<<<<<< @@ -2424,7 +2414,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs(__pyx_t_ for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) { __pyx_v_b = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":118 + /* "gensim/models/doc2vec_inner.pyx":112 * # work (also passed in) will accumulate l1 error * for b in range(word_code_len): * row2 = word_point[b] * size # <<<<<<<<<<<<<< @@ -2433,7 +2423,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs(__pyx_t_ */ __pyx_v_row2 = ((__pyx_v_word_point[__pyx_v_b]) * __pyx_v_size); - /* "gensim/models/doc2vec_inner.pyx":119 + /* "gensim/models/doc2vec_inner.pyx":113 * for b in range(word_code_len): * row2 = word_point[b] * size * f = our_dot(&size, neu1, &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<< @@ -2442,7 +2432,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs(__pyx_t_ */ __pyx_v_f = __pyx_v_6gensim_6models_14word2vec_inner_our_dot((&__pyx_v_size), __pyx_v_neu1, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), 
(&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":120 + /* "gensim/models/doc2vec_inner.pyx":114 * row2 = word_point[b] * size * f = our_dot(&size, neu1, &ONE, &syn1[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<< @@ -2460,7 +2450,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs(__pyx_t_ __pyx_L6_bool_binop_done:; if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":121 + /* "gensim/models/doc2vec_inner.pyx":115 * f = our_dot(&size, neu1, &ONE, &syn1[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: * continue # <<<<<<<<<<<<<< @@ -2469,7 +2459,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs(__pyx_t_ */ goto __pyx_L3_continue; - /* "gensim/models/doc2vec_inner.pyx":120 + /* "gensim/models/doc2vec_inner.pyx":114 * row2 = word_point[b] * size * f = our_dot(&size, neu1, &ONE, &syn1[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<< @@ -2478,7 +2468,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs(__pyx_t_ */ } - /* "gensim/models/doc2vec_inner.pyx":122 + /* "gensim/models/doc2vec_inner.pyx":116 * if f <= -MAX_EXP or f >= MAX_EXP: * continue * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<< @@ -2487,7 +2477,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs(__pyx_t_ */ __pyx_v_f = (__pyx_v_6gensim_6models_14word2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 83.0))]); - /* "gensim/models/doc2vec_inner.pyx":123 + /* "gensim/models/doc2vec_inner.pyx":117 * continue * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] * g = (1 - word_code[b] - f) * alpha # <<<<<<<<<<<<<< @@ -2496,7 +2486,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs(__pyx_t_ */ __pyx_v_g = (((1 - (__pyx_v_word_code[__pyx_v_b])) - __pyx_v_f) * __pyx_v_alpha); - /* "gensim/models/doc2vec_inner.pyx":124 + /* "gensim/models/doc2vec_inner.pyx":118 * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] * g = (1 - word_code[b] - f) * alpha * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<< @@ -2505,7 +2495,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs(__pyx_t_ */ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":125 + /* "gensim/models/doc2vec_inner.pyx":119 * g = (1 - word_code[b] - f) * alpha * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) * if learn_hidden: # <<<<<<<<<<<<<< @@ -2515,7 +2505,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs(__pyx_t_ __pyx_t_3 = (__pyx_v_learn_hidden != 0); if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":126 + /* "gensim/models/doc2vec_inner.pyx":120 * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) * if learn_hidden: * our_saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<< @@ -2524,7 +2514,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs(__pyx_t_ */ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_g), __pyx_v_neu1, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":125 + /* "gensim/models/doc2vec_inner.pyx":119 * g = (1 - word_code[b] - f) * 
alpha * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) * if learn_hidden: # <<<<<<<<<<<<<< @@ -2535,7 +2525,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs(__pyx_t_ __pyx_L3_continue:; } - /* "gensim/models/doc2vec_inner.pyx":106 + /* "gensim/models/doc2vec_inner.pyx":100 * * * cdef void fast_document_dm_hs( # <<<<<<<<<<<<<< @@ -2546,7 +2536,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs(__pyx_t_ /* function exit code */ } -/* "gensim/models/doc2vec_inner.pyx":129 +/* "gensim/models/doc2vec_inner.pyx":123 * * * cdef unsigned long long fast_document_dm_neg( # <<<<<<<<<<<<<< @@ -2568,7 +2558,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume int __pyx_t_3; int __pyx_t_4; - /* "gensim/models/doc2vec_inner.pyx":135 + /* "gensim/models/doc2vec_inner.pyx":129 * * cdef long long row2 * cdef unsigned long long modulo = 281474976710655ULL # <<<<<<<<<<<<<< @@ -2577,7 +2567,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_modulo = 281474976710655ULL; - /* "gensim/models/doc2vec_inner.pyx":142 + /* "gensim/models/doc2vec_inner.pyx":136 * # l1 already composed by caller, passed in as neu1 * # work (also passsed in) will accumulate l1 error for outside application * for d in range(negative+1): # <<<<<<<<<<<<<< @@ -2588,7 +2578,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) { __pyx_v_d = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":143 + /* "gensim/models/doc2vec_inner.pyx":137 * # work (also passsed in) will accumulate l1 error for outside application * for d in range(negative+1): * if d == 0: # <<<<<<<<<<<<<< @@ -2598,7 +2588,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_t_3 = ((__pyx_v_d == 0) != 0); if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":144 + /* "gensim/models/doc2vec_inner.pyx":138 * for d in range(negative+1): * if d == 0: * target_index = predict_word_index # <<<<<<<<<<<<<< @@ -2607,7 +2597,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_target_index = __pyx_v_predict_word_index; - /* "gensim/models/doc2vec_inner.pyx":145 + /* "gensim/models/doc2vec_inner.pyx":139 * if d == 0: * target_index = predict_word_index * label = ONEF # <<<<<<<<<<<<<< @@ -2616,7 +2606,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_label = __pyx_v_6gensim_6models_13doc2vec_inner_ONEF; - /* "gensim/models/doc2vec_inner.pyx":143 + /* "gensim/models/doc2vec_inner.pyx":137 * # work (also passsed in) will accumulate l1 error for outside application * for d in range(negative+1): * if d == 0: # <<<<<<<<<<<<<< @@ -2626,7 +2616,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume goto __pyx_L5; } - /* "gensim/models/doc2vec_inner.pyx":147 + /* "gensim/models/doc2vec_inner.pyx":141 * label = ONEF * else: * target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len) # <<<<<<<<<<<<<< @@ -2636,7 +2626,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume /*else*/ { __pyx_v_target_index = __pyx_f_6gensim_6models_14word2vec_inner_bisect_left(__pyx_v_cum_table, ((__pyx_v_next_random >> 16) % (__pyx_v_cum_table[(__pyx_v_cum_table_len - 1)])), 0, __pyx_v_cum_table_len); - /* "gensim/models/doc2vec_inner.pyx":148 + /* 
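
fast_document_dm_hs (above) and fast_document_dm_neg (whose hunks follow) receive neu1 already composed by the caller, i.e. the combination of the document vector and the window's word vectors, and accumulate the input-side error in work for the caller to redistribute. A sketch of the negative-sampling update against such a composed input, under the same illustrative EXP_TABLE constants as the first sketch and consuming pairs from the sampling sketch above:

    import numpy as np

    EXP_TABLE_SIZE, MAX_EXP = 1000, 6
    EXP_TABLE = 1.0 / (1.0 + np.exp(-np.linspace(-MAX_EXP, MAX_EXP, EXP_TABLE_SIZE)))

    def dm_neg_update(neu1, syn1neg, pairs, alpha, learn_hidden=True):
        """Accumulate the l1 error for a composed context against sampled targets."""
        work = np.zeros_like(neu1)
        for target, label in pairs:               # pairs from draw_negatives above
            f = neu1 @ syn1neg[target]
            if f <= -MAX_EXP or f >= MAX_EXP:     # skip saturated scores
                continue
            f = EXP_TABLE[int((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
            g = (label - f) * alpha
            work += g * syn1neg[target]           # error for the input side
            if learn_hidden:
                syn1neg[target] += g * neu1       # update output weights in place
        return work

The dmc variants further down differ only in that neu1 is the concatenation of the document and window vectors, so layer1_size replaces size in every stride.
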
"gensim/models/doc2vec_inner.pyx":142 * else: * target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len) * next_random = (next_random * 25214903917ULL + 11) & modulo # <<<<<<<<<<<<<< @@ -2645,7 +2635,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_next_random = (((__pyx_v_next_random * ((unsigned PY_LONG_LONG)25214903917ULL)) + 11) & __pyx_v_modulo); - /* "gensim/models/doc2vec_inner.pyx":149 + /* "gensim/models/doc2vec_inner.pyx":143 * target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len) * next_random = (next_random * 25214903917ULL + 11) & modulo * if target_index == predict_word_index: # <<<<<<<<<<<<<< @@ -2655,7 +2645,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_t_3 = ((__pyx_v_target_index == __pyx_v_predict_word_index) != 0); if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":150 + /* "gensim/models/doc2vec_inner.pyx":144 * next_random = (next_random * 25214903917ULL + 11) & modulo * if target_index == predict_word_index: * continue # <<<<<<<<<<<<<< @@ -2664,7 +2654,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ goto __pyx_L3_continue; - /* "gensim/models/doc2vec_inner.pyx":149 + /* "gensim/models/doc2vec_inner.pyx":143 * target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len) * next_random = (next_random * 25214903917ULL + 11) & modulo * if target_index == predict_word_index: # <<<<<<<<<<<<<< @@ -2673,7 +2663,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ } - /* "gensim/models/doc2vec_inner.pyx":151 + /* "gensim/models/doc2vec_inner.pyx":145 * if target_index == predict_word_index: * continue * label = 0.0 # <<<<<<<<<<<<<< @@ -2684,7 +2674,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume } __pyx_L5:; - /* "gensim/models/doc2vec_inner.pyx":153 + /* "gensim/models/doc2vec_inner.pyx":147 * label = 0.0 * * row2 = target_index * size # <<<<<<<<<<<<<< @@ -2693,7 +2683,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_row2 = (__pyx_v_target_index * __pyx_v_size); - /* "gensim/models/doc2vec_inner.pyx":154 + /* "gensim/models/doc2vec_inner.pyx":148 * * row2 = target_index * size * f = our_dot(&size, neu1, &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<< @@ -2702,7 +2692,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_f = __pyx_v_6gensim_6models_14word2vec_inner_our_dot((&__pyx_v_size), __pyx_v_neu1, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":155 + /* "gensim/models/doc2vec_inner.pyx":149 * row2 = target_index * size * f = our_dot(&size, neu1, &ONE, &syn1neg[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<< @@ -2720,7 +2710,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_L8_bool_binop_done:; if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":156 + /* "gensim/models/doc2vec_inner.pyx":150 * f = our_dot(&size, neu1, &ONE, &syn1neg[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: * continue # <<<<<<<<<<<<<< @@ -2729,7 +2719,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ goto __pyx_L3_continue; - /* 
"gensim/models/doc2vec_inner.pyx":155 + /* "gensim/models/doc2vec_inner.pyx":149 * row2 = target_index * size * f = our_dot(&size, neu1, &ONE, &syn1neg[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<< @@ -2738,7 +2728,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ } - /* "gensim/models/doc2vec_inner.pyx":157 + /* "gensim/models/doc2vec_inner.pyx":151 * if f <= -MAX_EXP or f >= MAX_EXP: * continue * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<< @@ -2747,7 +2737,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_f = (__pyx_v_6gensim_6models_14word2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 83.0))]); - /* "gensim/models/doc2vec_inner.pyx":158 + /* "gensim/models/doc2vec_inner.pyx":152 * continue * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] * g = (label - f) * alpha # <<<<<<<<<<<<<< @@ -2756,7 +2746,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_g = ((__pyx_v_label - __pyx_v_f) * __pyx_v_alpha); - /* "gensim/models/doc2vec_inner.pyx":159 + /* "gensim/models/doc2vec_inner.pyx":153 * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] * g = (label - f) * alpha * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<< @@ -2765,7 +2755,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":160 + /* "gensim/models/doc2vec_inner.pyx":154 * g = (label - f) * alpha * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) * if learn_hidden: # <<<<<<<<<<<<<< @@ -2775,7 +2765,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_t_3 = (__pyx_v_learn_hidden != 0); if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":161 + /* "gensim/models/doc2vec_inner.pyx":155 * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) * if learn_hidden: * our_saxpy(&size, &g, neu1, &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<< @@ -2784,7 +2774,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_g), __pyx_v_neu1, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":160 + /* "gensim/models/doc2vec_inner.pyx":154 * g = (label - f) * alpha * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) * if learn_hidden: # <<<<<<<<<<<<<< @@ -2795,7 +2785,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_L3_continue:; } - /* "gensim/models/doc2vec_inner.pyx":163 + /* "gensim/models/doc2vec_inner.pyx":157 * our_saxpy(&size, &g, neu1, &ONE, &syn1neg[row2], &ONE) * * return next_random # <<<<<<<<<<<<<< @@ -2805,7 +2795,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_r = __pyx_v_next_random; goto __pyx_L0; - /* "gensim/models/doc2vec_inner.pyx":129 + /* "gensim/models/doc2vec_inner.pyx":123 * * * cdef unsigned long long fast_document_dm_neg( # <<<<<<<<<<<<<< @@ -2818,7 +2808,7 @@ static unsigned PY_LONG_LONG 
__pyx_f_6gensim_6models_13doc2vec_inner_fast_docume return __pyx_r; } -/* "gensim/models/doc2vec_inner.pyx":165 +/* "gensim/models/doc2vec_inner.pyx":159 * return next_random * * cdef void fast_document_dmc_hs( # <<<<<<<<<<<<<< @@ -2836,7 +2826,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs(__pyx_t int __pyx_t_3; int __pyx_t_4; - /* "gensim/models/doc2vec_inner.pyx":177 + /* "gensim/models/doc2vec_inner.pyx":171 * # l1 already composed by caller, passed in as neu1 * # work accumulates net l1 error; eventually applied by caller * for b in range(word_code_len): # <<<<<<<<<<<<<< @@ -2847,7 +2837,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs(__pyx_t for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) { __pyx_v_b = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":178 + /* "gensim/models/doc2vec_inner.pyx":172 * # work accumulates net l1 error; eventually applied by caller * for b in range(word_code_len): * row2 = word_point[b] * layer1_size # <<<<<<<<<<<<<< @@ -2856,7 +2846,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs(__pyx_t */ __pyx_v_row2 = ((__pyx_v_word_point[__pyx_v_b]) * __pyx_v_layer1_size); - /* "gensim/models/doc2vec_inner.pyx":179 + /* "gensim/models/doc2vec_inner.pyx":173 * for b in range(word_code_len): * row2 = word_point[b] * layer1_size * f = our_dot(&layer1_size, neu1, &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<< @@ -2865,7 +2855,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs(__pyx_t */ __pyx_v_f = __pyx_v_6gensim_6models_14word2vec_inner_our_dot((&__pyx_v_layer1_size), __pyx_v_neu1, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":180 + /* "gensim/models/doc2vec_inner.pyx":174 * row2 = word_point[b] * layer1_size * f = our_dot(&layer1_size, neu1, &ONE, &syn1[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<< @@ -2883,7 +2873,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs(__pyx_t __pyx_L6_bool_binop_done:; if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":181 + /* "gensim/models/doc2vec_inner.pyx":175 * f = our_dot(&layer1_size, neu1, &ONE, &syn1[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: * continue # <<<<<<<<<<<<<< @@ -2892,7 +2882,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs(__pyx_t */ goto __pyx_L3_continue; - /* "gensim/models/doc2vec_inner.pyx":180 + /* "gensim/models/doc2vec_inner.pyx":174 * row2 = word_point[b] * layer1_size * f = our_dot(&layer1_size, neu1, &ONE, &syn1[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<< @@ -2901,7 +2891,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs(__pyx_t */ } - /* "gensim/models/doc2vec_inner.pyx":182 + /* "gensim/models/doc2vec_inner.pyx":176 * if f <= -MAX_EXP or f >= MAX_EXP: * continue * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<< @@ -2910,7 +2900,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs(__pyx_t */ __pyx_v_f = (__pyx_v_6gensim_6models_14word2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 83.0))]); - /* "gensim/models/doc2vec_inner.pyx":183 + /* "gensim/models/doc2vec_inner.pyx":177 * continue * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] * g = (1 - word_code[b] - f) * alpha # <<<<<<<<<<<<<< @@ -2919,7 +2909,7 @@ static void 
__pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs(__pyx_t */ __pyx_v_g = (((1 - (__pyx_v_word_code[__pyx_v_b])) - __pyx_v_f) * __pyx_v_alpha); - /* "gensim/models/doc2vec_inner.pyx":184 + /* "gensim/models/doc2vec_inner.pyx":178 * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] * g = (1 - word_code[b] - f) * alpha * our_saxpy(&layer1_size, &g, &syn1[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<< @@ -2928,7 +2918,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs(__pyx_t */ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_layer1_size), (&__pyx_v_g), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":185 + /* "gensim/models/doc2vec_inner.pyx":179 * g = (1 - word_code[b] - f) * alpha * our_saxpy(&layer1_size, &g, &syn1[row2], &ONE, work, &ONE) * if learn_hidden: # <<<<<<<<<<<<<< @@ -2938,7 +2928,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs(__pyx_t __pyx_t_3 = (__pyx_v_learn_hidden != 0); if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":186 + /* "gensim/models/doc2vec_inner.pyx":180 * our_saxpy(&layer1_size, &g, &syn1[row2], &ONE, work, &ONE) * if learn_hidden: * our_saxpy(&layer1_size, &g, neu1, &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<< @@ -2947,7 +2937,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs(__pyx_t */ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_layer1_size), (&__pyx_v_g), __pyx_v_neu1, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":185 + /* "gensim/models/doc2vec_inner.pyx":179 * g = (1 - word_code[b] - f) * alpha * our_saxpy(&layer1_size, &g, &syn1[row2], &ONE, work, &ONE) * if learn_hidden: # <<<<<<<<<<<<<< @@ -2958,7 +2948,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs(__pyx_t __pyx_L3_continue:; } - /* "gensim/models/doc2vec_inner.pyx":165 + /* "gensim/models/doc2vec_inner.pyx":159 * return next_random * * cdef void fast_document_dmc_hs( # <<<<<<<<<<<<<< @@ -2969,7 +2959,7 @@ static void __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs(__pyx_t /* function exit code */ } -/* "gensim/models/doc2vec_inner.pyx":189 +/* "gensim/models/doc2vec_inner.pyx":183 * * * cdef unsigned long long fast_document_dmc_neg( # <<<<<<<<<<<<<< @@ -2991,7 +2981,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume int __pyx_t_3; int __pyx_t_4; - /* "gensim/models/doc2vec_inner.pyx":196 + /* "gensim/models/doc2vec_inner.pyx":190 * cdef long long a * cdef long long row2 * cdef unsigned long long modulo = 281474976710655ULL # <<<<<<<<<<<<<< @@ -3000,7 +2990,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_modulo = 281474976710655ULL; - /* "gensim/models/doc2vec_inner.pyx":203 + /* "gensim/models/doc2vec_inner.pyx":197 * # l1 already composed by caller, passed in as neu1 * # work accumulates net l1 error; eventually applied by caller * for d in range(negative+1): # <<<<<<<<<<<<<< @@ -3011,7 +3001,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) { __pyx_v_d = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":204 + /* "gensim/models/doc2vec_inner.pyx":198 * # work accumulates net l1 error; eventually 
applied by caller * for d in range(negative+1): * if d == 0: # <<<<<<<<<<<<<< @@ -3021,7 +3011,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_t_3 = ((__pyx_v_d == 0) != 0); if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":205 + /* "gensim/models/doc2vec_inner.pyx":199 * for d in range(negative+1): * if d == 0: * target_index = predict_word_index # <<<<<<<<<<<<<< @@ -3030,7 +3020,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_target_index = __pyx_v_predict_word_index; - /* "gensim/models/doc2vec_inner.pyx":206 + /* "gensim/models/doc2vec_inner.pyx":200 * if d == 0: * target_index = predict_word_index * label = ONEF # <<<<<<<<<<<<<< @@ -3039,7 +3029,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_label = __pyx_v_6gensim_6models_13doc2vec_inner_ONEF; - /* "gensim/models/doc2vec_inner.pyx":204 + /* "gensim/models/doc2vec_inner.pyx":198 * # work accumulates net l1 error; eventually applied by caller * for d in range(negative+1): * if d == 0: # <<<<<<<<<<<<<< @@ -3049,7 +3039,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume goto __pyx_L5; } - /* "gensim/models/doc2vec_inner.pyx":208 + /* "gensim/models/doc2vec_inner.pyx":202 * label = ONEF * else: * target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len) # <<<<<<<<<<<<<< @@ -3059,7 +3049,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume /*else*/ { __pyx_v_target_index = __pyx_f_6gensim_6models_14word2vec_inner_bisect_left(__pyx_v_cum_table, ((__pyx_v_next_random >> 16) % (__pyx_v_cum_table[(__pyx_v_cum_table_len - 1)])), 0, __pyx_v_cum_table_len); - /* "gensim/models/doc2vec_inner.pyx":209 + /* "gensim/models/doc2vec_inner.pyx":203 * else: * target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len) * next_random = (next_random * 25214903917ULL + 11) & modulo # <<<<<<<<<<<<<< @@ -3068,7 +3058,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_next_random = (((__pyx_v_next_random * ((unsigned PY_LONG_LONG)25214903917ULL)) + 11) & __pyx_v_modulo); - /* "gensim/models/doc2vec_inner.pyx":210 + /* "gensim/models/doc2vec_inner.pyx":204 * target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len) * next_random = (next_random * 25214903917ULL + 11) & modulo * if target_index == predict_word_index: # <<<<<<<<<<<<<< @@ -3078,7 +3068,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_t_3 = ((__pyx_v_target_index == __pyx_v_predict_word_index) != 0); if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":211 + /* "gensim/models/doc2vec_inner.pyx":205 * next_random = (next_random * 25214903917ULL + 11) & modulo * if target_index == predict_word_index: * continue # <<<<<<<<<<<<<< @@ -3087,7 +3077,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ goto __pyx_L3_continue; - /* "gensim/models/doc2vec_inner.pyx":210 + /* "gensim/models/doc2vec_inner.pyx":204 * target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len) * next_random = (next_random * 25214903917ULL + 11) & modulo * if target_index == predict_word_index: # <<<<<<<<<<<<<< @@ -3096,7 +3086,7 @@ static unsigned PY_LONG_LONG 
__pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ } - /* "gensim/models/doc2vec_inner.pyx":212 + /* "gensim/models/doc2vec_inner.pyx":206 * if target_index == predict_word_index: * continue * label = 0.0 # <<<<<<<<<<<<<< @@ -3107,7 +3097,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume } __pyx_L5:; - /* "gensim/models/doc2vec_inner.pyx":214 + /* "gensim/models/doc2vec_inner.pyx":208 * label = 0.0 * * row2 = target_index * layer1_size # <<<<<<<<<<<<<< @@ -3116,7 +3106,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_row2 = (__pyx_v_target_index * __pyx_v_layer1_size); - /* "gensim/models/doc2vec_inner.pyx":215 + /* "gensim/models/doc2vec_inner.pyx":209 * * row2 = target_index * layer1_size * f = our_dot(&layer1_size, neu1, &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<< @@ -3125,7 +3115,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_f = __pyx_v_6gensim_6models_14word2vec_inner_our_dot((&__pyx_v_layer1_size), __pyx_v_neu1, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":216 + /* "gensim/models/doc2vec_inner.pyx":210 * row2 = target_index * layer1_size * f = our_dot(&layer1_size, neu1, &ONE, &syn1neg[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<< @@ -3143,7 +3133,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_L8_bool_binop_done:; if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":217 + /* "gensim/models/doc2vec_inner.pyx":211 * f = our_dot(&layer1_size, neu1, &ONE, &syn1neg[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: * continue # <<<<<<<<<<<<<< @@ -3152,7 +3142,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ goto __pyx_L3_continue; - /* "gensim/models/doc2vec_inner.pyx":216 + /* "gensim/models/doc2vec_inner.pyx":210 * row2 = target_index * layer1_size * f = our_dot(&layer1_size, neu1, &ONE, &syn1neg[row2], &ONE) * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<< @@ -3161,7 +3151,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ } - /* "gensim/models/doc2vec_inner.pyx":218 + /* "gensim/models/doc2vec_inner.pyx":212 * if f <= -MAX_EXP or f >= MAX_EXP: * continue * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<< @@ -3170,7 +3160,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_f = (__pyx_v_6gensim_6models_14word2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 83.0))]); - /* "gensim/models/doc2vec_inner.pyx":219 + /* "gensim/models/doc2vec_inner.pyx":213 * continue * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] * g = (label - f) * alpha # <<<<<<<<<<<<<< @@ -3179,7 +3169,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_g = ((__pyx_v_label - __pyx_v_f) * __pyx_v_alpha); - /* "gensim/models/doc2vec_inner.pyx":220 + /* "gensim/models/doc2vec_inner.pyx":214 * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] * g = (label - f) * alpha * our_saxpy(&layer1_size, &g, &syn1neg[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<< @@ -3188,7 +3178,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_layer1_size), (&__pyx_v_g), 
(&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":221 + /* "gensim/models/doc2vec_inner.pyx":215 * g = (label - f) * alpha * our_saxpy(&layer1_size, &g, &syn1neg[row2], &ONE, work, &ONE) * if learn_hidden: # <<<<<<<<<<<<<< @@ -3198,7 +3188,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_t_3 = (__pyx_v_learn_hidden != 0); if (__pyx_t_3) { - /* "gensim/models/doc2vec_inner.pyx":222 + /* "gensim/models/doc2vec_inner.pyx":216 * our_saxpy(&layer1_size, &g, &syn1neg[row2], &ONE, work, &ONE) * if learn_hidden: * our_saxpy(&layer1_size, &g, neu1, &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<< @@ -3207,7 +3197,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume */ __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_layer1_size), (&__pyx_v_g), __pyx_v_neu1, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":221 + /* "gensim/models/doc2vec_inner.pyx":215 * g = (label - f) * alpha * our_saxpy(&layer1_size, &g, &syn1neg[row2], &ONE, work, &ONE) * if learn_hidden: # <<<<<<<<<<<<<< @@ -3218,7 +3208,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_L3_continue:; } - /* "gensim/models/doc2vec_inner.pyx":224 + /* "gensim/models/doc2vec_inner.pyx":218 * our_saxpy(&layer1_size, &g, neu1, &ONE, &syn1neg[row2], &ONE) * * return next_random # <<<<<<<<<<<<<< @@ -3228,7 +3218,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume __pyx_r = __pyx_v_next_random; goto __pyx_L0; - /* "gensim/models/doc2vec_inner.pyx":189 + /* "gensim/models/doc2vec_inner.pyx":183 * * * cdef unsigned long long fast_document_dmc_neg( # <<<<<<<<<<<<<< @@ -3241,7 +3231,7 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume return __pyx_r; } -/* "gensim/models/doc2vec_inner.pyx":227 +/* "gensim/models/doc2vec_inner.pyx":221 * * * def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, # <<<<<<<<<<<<<< @@ -3251,7 +3241,8 @@ static unsigned PY_LONG_LONG __pyx_f_6gensim_6models_13doc2vec_inner_fast_docume /* Python wrapper */ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_1train_document_dbow(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ -static PyMethodDef __pyx_mdef_6gensim_6models_13doc2vec_inner_1train_document_dbow = {"train_document_dbow", (PyCFunction)__pyx_pw_6gensim_6models_13doc2vec_inner_1train_document_dbow, METH_VARARGS|METH_KEYWORDS, 0}; +static char __pyx_doc_6gensim_6models_13doc2vec_inner_train_document_dbow[] = "train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None)\nUpdate distributed bag of words model (\"PV-DBOW\") by training on a single document.\n\n Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and\n :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector`.\n\n Parameters\n ----------\n model : :class:`~gensim.models.doc2vec.Doc2Vec`\n The model to train.\n doc_words : list of str\n The input document as a list of words to be used for training. 
Each word will be looked up in\n the model's vocabulary.\n doctag_indexes : list of int\n Indices into `doctag_vectors` used to obtain the tags of the document.\n alpha : float\n Learning rate.\n work : list of float, optional\n Updates to be performed on each neuron in the hidden layer of the underlying network.\n train_words : bool, optional\n Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both** `learn_words`\n and `train_words` are set to True.\n learn_doctags : bool, optional\n Whether the tag vectors should be updated.\n learn_words : bool, optional\n Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both**\n `learn_words` and `train_words` are set to True.\n learn_hidden : bool, optional\n Whether or not the weights of the hidden layer will be updated.\n word_vectors : numpy.ndarray, optional\n The vector representation for each word in the vocabulary. If None, these will be retrieved from the model.\n word_locks : numpy.ndarray, optional\n A learning lock factor for each weight in the hidden layer for words, value 0 completely blocks updates,\n a value of 1 allows to update word-vectors.\n doctag_vectors : numpy.ndarray, ""optional\n Vector representations of the tags. If None, these will be retrieved from the model.\n doctag_locks : numpy.ndarray, optional\n The lock factors for each tag, same as `word_locks`, but for document-vectors.\n\n Returns\n -------\n int\n Number of words in the input document that were actually used for training.\n\n "; +static PyMethodDef __pyx_mdef_6gensim_6models_13doc2vec_inner_1train_document_dbow = {"train_document_dbow", (PyCFunction)__pyx_pw_6gensim_6models_13doc2vec_inner_1train_document_dbow, METH_VARARGS|METH_KEYWORDS, __pyx_doc_6gensim_6models_13doc2vec_inner_train_document_dbow}; static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_1train_document_dbow(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject *__pyx_v_model = 0; PyObject *__pyx_v_doc_words = 0; @@ -3274,24 +3265,24 @@ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_1train_document_dbow(P PyObject* values[13] = {0,0,0,0,0,0,0,0,0,0,0,0,0}; values[4] = ((PyObject *)Py_None); - /* "gensim/models/doc2vec_inner.pyx":228 + /* "gensim/models/doc2vec_inner.pyx":222 * * def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, * train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, # <<<<<<<<<<<<<< * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): - * cdef int hs = model.hs + * """Update distributed bag of words model ("PV-DBOW") by training on a single document. */ values[5] = ((PyObject *)Py_False); values[6] = ((PyObject *)Py_True); values[7] = ((PyObject *)Py_True); values[8] = ((PyObject *)Py_True); - /* "gensim/models/doc2vec_inner.pyx":229 + /* "gensim/models/doc2vec_inner.pyx":223 * def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, * train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): # <<<<<<<<<<<<<< - * cdef int hs = model.hs - * cdef int negative = model.negative + * """Update distributed bag of words model ("PV-DBOW") by training on a single document. 
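
The new __pyx_doc_ string above is the patch's real payload here: Cython now attaches the NumPy-style docstring to the compiled train_document_dbow. Given the signature it documents, a hedged usage sketch follows (toy corpus; an infer-style call with a caller-supplied float32 work buffer, roughly as infer_vector does internally; parameter values are illustrative):

    import numpy as np
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    from gensim.models.doc2vec_inner import train_document_dbow

    docs = [TaggedDocument(words=["human", "interface", "computer"], tags=[0])]
    model = Doc2Vec(docs, dm=0, vector_size=100, min_count=1, epochs=1)

    # our own work buffer; word vectors frozen, only the tag vector trains
    work = np.zeros(model.trainables.layer1_size, dtype=np.float32)
    used = train_document_dbow(model, docs[0].words, doctag_indexes=[0],
                               alpha=0.025, work=work, learn_words=False)
    print(used)   # number of document words actually used for training

The word_locks / doctag_locks arrays documented above default to model.trainables.vectors_lockf and vectors_docs_lockf (fetched in the hunks further down); a 0.0 entry freezes that vector while 1.0 lets it train normally.
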
+ * */ values[9] = ((PyObject *)Py_None); values[10] = ((PyObject *)Py_None); @@ -3339,19 +3330,19 @@ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_1train_document_dbow(P case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_doc_words)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train_document_dbow", 0, 4, 13, 1); __PYX_ERR(0, 227, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("train_document_dbow", 0, 4, 13, 1); __PYX_ERR(0, 221, __pyx_L3_error) } CYTHON_FALLTHROUGH; case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_doctag_indexes)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train_document_dbow", 0, 4, 13, 2); __PYX_ERR(0, 227, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("train_document_dbow", 0, 4, 13, 2); __PYX_ERR(0, 221, __pyx_L3_error) } CYTHON_FALLTHROUGH; case 3: if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_alpha)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train_document_dbow", 0, 4, 13, 3); __PYX_ERR(0, 227, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("train_document_dbow", 0, 4, 13, 3); __PYX_ERR(0, 221, __pyx_L3_error) } CYTHON_FALLTHROUGH; case 4: @@ -3409,7 +3400,7 @@ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_1train_document_dbow(P } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train_document_dbow") < 0)) __PYX_ERR(0, 227, __pyx_L3_error) + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train_document_dbow") < 0)) __PYX_ERR(0, 221, __pyx_L3_error) } } else { switch (PyTuple_GET_SIZE(__pyx_args)) { @@ -3455,7 +3446,7 @@ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_1train_document_dbow(P } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("train_document_dbow", 0, 4, 13, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 227, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("train_document_dbow", 0, 4, 13, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 221, __pyx_L3_error) __pyx_L3_error:; __Pyx_AddTraceback("gensim.models.doc2vec_inner.train_document_dbow", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); @@ -3463,7 +3454,7 @@ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_1train_document_dbow(P __pyx_L4_argument_unpacking_done:; __pyx_r = __pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(__pyx_self, __pyx_v_model, __pyx_v_doc_words, __pyx_v_doctag_indexes, __pyx_v_alpha, __pyx_v_work, __pyx_v_train_words, __pyx_v_learn_doctags, __pyx_v_learn_words, __pyx_v_learn_hidden, __pyx_v_word_vectors, __pyx_v_word_locks, __pyx_v_doctag_vectors, __pyx_v_doctag_locks); - /* "gensim/models/doc2vec_inner.pyx":227 + /* "gensim/models/doc2vec_inner.pyx":221 * * * def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, # <<<<<<<<<<<<<< @@ -3541,130 +3532,130 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __Pyx_INCREF(__pyx_v_doctag_vectors); __Pyx_INCREF(__pyx_v_doctag_locks); - /* "gensim/models/doc2vec_inner.pyx":230 - * train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, - * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): + /* "gensim/models/doc2vec_inner.pyx":268 + * + * """ * cdef int hs = model.hs # <<<<<<<<<<<<<< * cdef int negative = model.negative * cdef int sample = (model.vocabulary.sample != 0) */ - __pyx_t_1 = 
__Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_hs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 230, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_hs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 268, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 230, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 268, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_hs = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":231 - * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): + /* "gensim/models/doc2vec_inner.pyx":269 + * """ * cdef int hs = model.hs * cdef int negative = model.negative # <<<<<<<<<<<<<< * cdef int sample = (model.vocabulary.sample != 0) * cdef int _train_words = train_words */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_negative); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 231, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_negative); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 269, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 231, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 269, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_negative = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":232 + /* "gensim/models/doc2vec_inner.pyx":270 * cdef int hs = model.hs * cdef int negative = model.negative * cdef int sample = (model.vocabulary.sample != 0) # <<<<<<<<<<<<<< * cdef int _train_words = train_words * cdef int _learn_words = learn_words */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 232, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 270, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_sample); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 232, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_sample); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 270, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = PyObject_RichCompare(__pyx_t_3, __pyx_int_0, Py_NE); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 232, __pyx_L1_error) + __pyx_t_1 = PyObject_RichCompare(__pyx_t_3, __pyx_int_0, Py_NE); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 270, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 232, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 270, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_sample = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":233 + /* "gensim/models/doc2vec_inner.pyx":271 * cdef int negative = model.negative * cdef int sample = (model.vocabulary.sample != 0) * cdef int _train_words = train_words # <<<<<<<<<<<<<< * cdef int _learn_words = learn_words * cdef int _learn_hidden = learn_hidden */ - __pyx_t_2 = 
__Pyx_PyInt_As_int(__pyx_v_train_words); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 233, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_train_words); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 271, __pyx_L1_error) __pyx_v__train_words = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":234 + /* "gensim/models/doc2vec_inner.pyx":272 * cdef int sample = (model.vocabulary.sample != 0) * cdef int _train_words = train_words * cdef int _learn_words = learn_words # <<<<<<<<<<<<<< * cdef int _learn_hidden = learn_hidden * cdef int _learn_doctags = learn_doctags */ - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_words); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 234, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_words); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 272, __pyx_L1_error) __pyx_v__learn_words = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":235 + /* "gensim/models/doc2vec_inner.pyx":273 * cdef int _train_words = train_words * cdef int _learn_words = learn_words * cdef int _learn_hidden = learn_hidden # <<<<<<<<<<<<<< * cdef int _learn_doctags = learn_doctags * */ - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_hidden); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 235, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_hidden); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 273, __pyx_L1_error) __pyx_v__learn_hidden = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":236 + /* "gensim/models/doc2vec_inner.pyx":274 * cdef int _learn_words = learn_words * cdef int _learn_hidden = learn_hidden * cdef int _learn_doctags = learn_doctags # <<<<<<<<<<<<<< * * cdef REAL_t *_word_vectors */ - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_doctags); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 236, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_doctags); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 274, __pyx_L1_error) __pyx_v__learn_doctags = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":243 + /* "gensim/models/doc2vec_inner.pyx":281 * cdef REAL_t *_doctag_locks * cdef REAL_t *_work * cdef REAL_t _alpha = alpha # <<<<<<<<<<<<<< * cdef int size = model.trainables.layer1_size * */ - __pyx_t_4 = __pyx_PyFloat_AsFloat(__pyx_v_alpha); if (unlikely((__pyx_t_4 == ((npy_float32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 243, __pyx_L1_error) + __pyx_t_4 = __pyx_PyFloat_AsFloat(__pyx_v_alpha); if (unlikely((__pyx_t_4 == ((npy_float32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 281, __pyx_L1_error) __pyx_v__alpha = __pyx_t_4; - /* "gensim/models/doc2vec_inner.pyx":244 + /* "gensim/models/doc2vec_inner.pyx":282 * cdef REAL_t *_work * cdef REAL_t _alpha = alpha * cdef int size = model.trainables.layer1_size # <<<<<<<<<<<<<< * * cdef int codelens[MAX_DOCUMENT_LEN] */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 244, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 282, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 244, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 282, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); 
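
The boilerplate in these hunks is Cython's expansion of plain attribute reads: each __Pyx_PyObject_GetAttrStr / __Pyx_PyInt_As_int pair converts one Python attribute of the model into a C int. A rough Python rendering of what the prologue captures, with attribute names taken from the hunks themselves and the model object assumed:

    def capture_config(model):
        """Mirror the C locals set up by train_document_dbow's prologue."""
        hs = int(model.hs)                             # hierarchical softmax flag
        negative = int(model.negative)                 # number of negative samples
        sample = int(model.vocabulary.sample != 0)     # is downsampling enabled?
        size = int(model.trainables.layer1_size)       # layer / vector size
        window = int(model.window)                     # context window radius
        return hs, negative, sample, size, window
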
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 244, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 282, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_v_size = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":252 + /* "gensim/models/doc2vec_inner.pyx":290 * cdef int document_len * cdef int doctag_len * cdef int window = model.window # <<<<<<<<<<<<<< * * cdef int i, j */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_window); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 252, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_window); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 290, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 252, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 290, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_v_window = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":256 + /* "gensim/models/doc2vec_inner.pyx":294 * cdef int i, j * cdef unsigned long long r * cdef long result = 0 # <<<<<<<<<<<<<< @@ -3673,7 +3664,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ __pyx_v_result = 0; - /* "gensim/models/doc2vec_inner.pyx":270 + /* "gensim/models/doc2vec_inner.pyx":308 * * # default vectors, locks from syn0/doctag_syn0 * if word_vectors is None: # <<<<<<<<<<<<<< @@ -3684,22 +3675,22 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_6 = (__pyx_t_5 != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":271 + /* "gensim/models/doc2vec_inner.pyx":309 * # default vectors, locks from syn0/doctag_syn0 * if word_vectors is None: * word_vectors = model.wv.vectors # <<<<<<<<<<<<<< * _word_vectors = (np.PyArray_DATA(word_vectors)) * if doctag_vectors is None: */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 271, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 309, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_vectors); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 271, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_vectors); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 309, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF_SET(__pyx_v_word_vectors, __pyx_t_1); __pyx_t_1 = 0; - /* "gensim/models/doc2vec_inner.pyx":270 + /* "gensim/models/doc2vec_inner.pyx":308 * * # default vectors, locks from syn0/doctag_syn0 * if word_vectors is None: # <<<<<<<<<<<<<< @@ -3708,17 +3699,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":272 + /* "gensim/models/doc2vec_inner.pyx":310 * if word_vectors is None: * word_vectors = model.wv.vectors * _word_vectors = (np.PyArray_DATA(word_vectors)) # <<<<<<<<<<<<<< * if doctag_vectors is None: * doctag_vectors = model.docvecs.vectors_docs */ - if (!(likely(((__pyx_v_word_vectors) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_word_vectors, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 272, 
__pyx_L1_error) + if (!(likely(((__pyx_v_word_vectors) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_word_vectors, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 310, __pyx_L1_error) __pyx_v__word_vectors = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_word_vectors))); - /* "gensim/models/doc2vec_inner.pyx":273 + /* "gensim/models/doc2vec_inner.pyx":311 * word_vectors = model.wv.vectors * _word_vectors = (np.PyArray_DATA(word_vectors)) * if doctag_vectors is None: # <<<<<<<<<<<<<< @@ -3729,22 +3720,22 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = (__pyx_t_6 != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":274 + /* "gensim/models/doc2vec_inner.pyx":312 * _word_vectors = (np.PyArray_DATA(word_vectors)) * if doctag_vectors is None: * doctag_vectors = model.docvecs.vectors_docs # <<<<<<<<<<<<<< * _doctag_vectors = (np.PyArray_DATA(doctag_vectors)) * if word_locks is None: */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_docvecs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 274, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_docvecs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 312, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_vectors_docs); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 274, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_vectors_docs); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 312, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF_SET(__pyx_v_doctag_vectors, __pyx_t_3); __pyx_t_3 = 0; - /* "gensim/models/doc2vec_inner.pyx":273 + /* "gensim/models/doc2vec_inner.pyx":311 * word_vectors = model.wv.vectors * _word_vectors = (np.PyArray_DATA(word_vectors)) * if doctag_vectors is None: # <<<<<<<<<<<<<< @@ -3753,17 +3744,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":275 + /* "gensim/models/doc2vec_inner.pyx":313 * if doctag_vectors is None: * doctag_vectors = model.docvecs.vectors_docs * _doctag_vectors = (np.PyArray_DATA(doctag_vectors)) # <<<<<<<<<<<<<< * if word_locks is None: * word_locks = model.trainables.vectors_lockf */ - if (!(likely(((__pyx_v_doctag_vectors) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_doctag_vectors, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 275, __pyx_L1_error) + if (!(likely(((__pyx_v_doctag_vectors) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_doctag_vectors, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 313, __pyx_L1_error) __pyx_v__doctag_vectors = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_doctag_vectors))); - /* "gensim/models/doc2vec_inner.pyx":276 + /* "gensim/models/doc2vec_inner.pyx":314 * doctag_vectors = model.docvecs.vectors_docs * _doctag_vectors = (np.PyArray_DATA(doctag_vectors)) * if word_locks is None: # <<<<<<<<<<<<<< @@ -3774,22 +3765,22 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_6 = (__pyx_t_5 != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":277 + /* "gensim/models/doc2vec_inner.pyx":315 * _doctag_vectors = (np.PyArray_DATA(doctag_vectors)) * if word_locks is None: * word_locks = model.trainables.vectors_lockf # <<<<<<<<<<<<<< * _word_locks = (np.PyArray_DATA(word_locks)) * if doctag_locks is None: */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, 
__pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 277, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 315, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_vectors_lockf); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 277, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_vectors_lockf); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 315, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF_SET(__pyx_v_word_locks, __pyx_t_1); __pyx_t_1 = 0; - /* "gensim/models/doc2vec_inner.pyx":276 + /* "gensim/models/doc2vec_inner.pyx":314 * doctag_vectors = model.docvecs.vectors_docs * _doctag_vectors = (np.PyArray_DATA(doctag_vectors)) * if word_locks is None: # <<<<<<<<<<<<<< @@ -3798,17 +3789,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":278 + /* "gensim/models/doc2vec_inner.pyx":316 * if word_locks is None: * word_locks = model.trainables.vectors_lockf * _word_locks = (np.PyArray_DATA(word_locks)) # <<<<<<<<<<<<<< * if doctag_locks is None: * doctag_locks = model.trainables.vectors_docs_lockf */ - if (!(likely(((__pyx_v_word_locks) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_word_locks, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 278, __pyx_L1_error) + if (!(likely(((__pyx_v_word_locks) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_word_locks, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 316, __pyx_L1_error) __pyx_v__word_locks = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_word_locks))); - /* "gensim/models/doc2vec_inner.pyx":279 + /* "gensim/models/doc2vec_inner.pyx":317 * word_locks = model.trainables.vectors_lockf * _word_locks = (np.PyArray_DATA(word_locks)) * if doctag_locks is None: # <<<<<<<<<<<<<< @@ -3819,22 +3810,22 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = (__pyx_t_6 != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":280 + /* "gensim/models/doc2vec_inner.pyx":318 * _word_locks = (np.PyArray_DATA(word_locks)) * if doctag_locks is None: * doctag_locks = model.trainables.vectors_docs_lockf # <<<<<<<<<<<<<< * _doctag_locks = (np.PyArray_DATA(doctag_locks)) * */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 280, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 318, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_vectors_docs_lockf); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 280, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_vectors_docs_lockf); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 318, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF_SET(__pyx_v_doctag_locks, __pyx_t_3); __pyx_t_3 = 0; - /* "gensim/models/doc2vec_inner.pyx":279 + /* "gensim/models/doc2vec_inner.pyx":317 * word_locks = model.trainables.vectors_lockf * _word_locks = (np.PyArray_DATA(word_locks)) * if doctag_locks is None: # <<<<<<<<<<<<<< @@ -3843,17 +3834,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":281 + /* "gensim/models/doc2vec_inner.pyx":319 * if doctag_locks is 
None: * doctag_locks = model.trainables.vectors_docs_lockf * _doctag_locks = (np.PyArray_DATA(doctag_locks)) # <<<<<<<<<<<<<< * * if hs: */ - if (!(likely(((__pyx_v_doctag_locks) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_doctag_locks, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 281, __pyx_L1_error) + if (!(likely(((__pyx_v_doctag_locks) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_doctag_locks, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 319, __pyx_L1_error) __pyx_v__doctag_locks = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_doctag_locks))); - /* "gensim/models/doc2vec_inner.pyx":283 + /* "gensim/models/doc2vec_inner.pyx":321 * _doctag_locks = (np.PyArray_DATA(doctag_locks)) * * if hs: # <<<<<<<<<<<<<< @@ -3863,23 +3854,23 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = (__pyx_v_hs != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":284 + /* "gensim/models/doc2vec_inner.pyx":322 * * if hs: * syn1 = (np.PyArray_DATA(model.trainables.syn1)) # <<<<<<<<<<<<<< * * if negative: */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 284, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 322, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_syn1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 284, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_syn1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 322, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 284, __pyx_L1_error) + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 322, __pyx_L1_error) __pyx_v_syn1 = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_1))); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/models/doc2vec_inner.pyx":283 + /* "gensim/models/doc2vec_inner.pyx":321 * _doctag_locks = (np.PyArray_DATA(doctag_locks)) * * if hs: # <<<<<<<<<<<<<< @@ -3888,7 +3879,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":286 + /* "gensim/models/doc2vec_inner.pyx":324 * syn1 = (np.PyArray_DATA(model.trainables.syn1)) * * if negative: # <<<<<<<<<<<<<< @@ -3898,55 +3889,55 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = (__pyx_v_negative != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":287 + /* "gensim/models/doc2vec_inner.pyx":325 * * if negative: * syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) # <<<<<<<<<<<<<< * cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) * cum_table_len = len(model.vocabulary.cum_table) */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 287, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 325, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_syn1neg); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 287, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_syn1neg); if 
(unlikely(!__pyx_t_3)) __PYX_ERR(0, 325, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - if (!(likely(((__pyx_t_3) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_3, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 287, __pyx_L1_error) + if (!(likely(((__pyx_t_3) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_3, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 325, __pyx_L1_error) __pyx_v_syn1neg = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_3))); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/models/doc2vec_inner.pyx":288 + /* "gensim/models/doc2vec_inner.pyx":326 * if negative: * syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) * cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) # <<<<<<<<<<<<<< * cum_table_len = len(model.vocabulary.cum_table) * if negative or sample: */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 288, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 326, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_cum_table); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 288, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_cum_table); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 326, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 288, __pyx_L1_error) + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 326, __pyx_L1_error) __pyx_v_cum_table = ((__pyx_t_5numpy_uint32_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_1))); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/models/doc2vec_inner.pyx":289 + /* "gensim/models/doc2vec_inner.pyx":327 * syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) * cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) * cum_table_len = len(model.vocabulary.cum_table) # <<<<<<<<<<<<<< * if negative or sample: * next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24) */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 289, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 327, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_cum_table); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 289, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_cum_table); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 327, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_7 = PyObject_Length(__pyx_t_3); if (unlikely(__pyx_t_7 == ((Py_ssize_t)-1))) __PYX_ERR(0, 289, __pyx_L1_error) + __pyx_t_7 = PyObject_Length(__pyx_t_3); if (unlikely(__pyx_t_7 == ((Py_ssize_t)-1))) __PYX_ERR(0, 327, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_v_cum_table_len = __pyx_t_7; - /* "gensim/models/doc2vec_inner.pyx":286 + /* "gensim/models/doc2vec_inner.pyx":324 * syn1 = (np.PyArray_DATA(model.trainables.syn1)) * * if negative: # <<<<<<<<<<<<<< @@ -3955,7 +3946,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* 
"gensim/models/doc2vec_inner.pyx":290 + /* "gensim/models/doc2vec_inner.pyx":328 * cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) * cum_table_len = len(model.vocabulary.cum_table) * if negative or sample: # <<<<<<<<<<<<<< @@ -3973,41 +3964,41 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_L10_bool_binop_done:; if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":291 + /* "gensim/models/doc2vec_inner.pyx":329 * cum_table_len = len(model.vocabulary.cum_table) * if negative or sample: * next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24) # <<<<<<<<<<<<<< * * # convert Python structures to primitive types, so we can release the GIL */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_random); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 291, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_random); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 329, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_randint); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 291, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_randint); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 329, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 291, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 329, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = PyNumber_Multiply(__pyx_int_16777216, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 291, __pyx_L1_error) + __pyx_t_1 = PyNumber_Multiply(__pyx_int_16777216, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 329, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_random); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 291, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_random); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 329, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_randint); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 291, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_randint); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 329, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_8, __pyx_tuple__2, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 291, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_8, __pyx_tuple__2, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 329, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; - __pyx_t_8 = PyNumber_Add(__pyx_t_1, __pyx_t_3); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 291, __pyx_L1_error) + __pyx_t_8 = PyNumber_Add(__pyx_t_1, __pyx_t_3); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 329, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_9 = __Pyx_PyInt_As_unsigned_PY_LONG_LONG(__pyx_t_8); if (unlikely((__pyx_t_9 == (unsigned PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 291, __pyx_L1_error) + __pyx_t_9 = __Pyx_PyInt_As_unsigned_PY_LONG_LONG(__pyx_t_8); if (unlikely((__pyx_t_9 == (unsigned PY_LONG_LONG)-1) && 
PyErr_Occurred())) __PYX_ERR(0, 329, __pyx_L1_error) __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; __pyx_v_next_random = __pyx_t_9; - /* "gensim/models/doc2vec_inner.pyx":290 + /* "gensim/models/doc2vec_inner.pyx":328 * cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) * cum_table_len = len(model.vocabulary.cum_table) * if negative or sample: # <<<<<<<<<<<<<< @@ -4016,7 +4007,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":294 + /* "gensim/models/doc2vec_inner.pyx":332 * * # convert Python structures to primitive types, so we can release the GIL * if work is None: # <<<<<<<<<<<<<< @@ -4027,32 +4018,32 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_6 = (__pyx_t_5 != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":295 + /* "gensim/models/doc2vec_inner.pyx":333 * # convert Python structures to primitive types, so we can release the GIL * if work is None: * work = zeros(model.trainables.layer1_size, dtype=REAL) # <<<<<<<<<<<<<< * _work = np.PyArray_DATA(work) * */ - __pyx_t_8 = __Pyx_GetModuleGlobalName(__pyx_n_s_zeros); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 295, __pyx_L1_error) + __pyx_t_8 = __Pyx_GetModuleGlobalName(__pyx_n_s_zeros); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 333, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 295, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 333, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 295, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 333, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 295, __pyx_L1_error) + __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 333, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 295, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 333, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_10 = __Pyx_GetModuleGlobalName(__pyx_n_s_REAL); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 295, __pyx_L1_error) + __pyx_t_10 = __Pyx_GetModuleGlobalName(__pyx_n_s_REAL); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 333, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); - if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_dtype, __pyx_t_10) < 0) __PYX_ERR(0, 295, __pyx_L1_error) + if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_dtype, __pyx_t_10) < 0) __PYX_ERR(0, 333, __pyx_L1_error) __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; - __pyx_t_10 = __Pyx_PyObject_Call(__pyx_t_8, __pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 295, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_Call(__pyx_t_8, __pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 333, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -4060,7 +4051,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __Pyx_DECREF_SET(__pyx_v_work, __pyx_t_10); 
__pyx_t_10 = 0; - /* "gensim/models/doc2vec_inner.pyx":294 + /* "gensim/models/doc2vec_inner.pyx":332 * * # convert Python structures to primitive types, so we can release the GIL * if work is None: # <<<<<<<<<<<<<< @@ -4069,32 +4060,32 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":296 + /* "gensim/models/doc2vec_inner.pyx":334 * if work is None: * work = zeros(model.trainables.layer1_size, dtype=REAL) * _work = np.PyArray_DATA(work) # <<<<<<<<<<<<<< * * vlookup = model.wv.vocab */ - if (!(likely(((__pyx_v_work) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_work, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 296, __pyx_L1_error) + if (!(likely(((__pyx_v_work) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_work, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 334, __pyx_L1_error) __pyx_v__work = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_work))); - /* "gensim/models/doc2vec_inner.pyx":298 + /* "gensim/models/doc2vec_inner.pyx":336 * _work = np.PyArray_DATA(work) * * vlookup = model.wv.vocab # <<<<<<<<<<<<<< * i = 0 * for token in doc_words: */ - __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 298, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 336, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_10, __pyx_n_s_vocab); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 298, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_10, __pyx_n_s_vocab); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 336, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; __pyx_v_vlookup = __pyx_t_1; __pyx_t_1 = 0; - /* "gensim/models/doc2vec_inner.pyx":299 + /* "gensim/models/doc2vec_inner.pyx":337 * * vlookup = model.wv.vocab * i = 0 # <<<<<<<<<<<<<< @@ -4103,7 +4094,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ __pyx_v_i = 0; - /* "gensim/models/doc2vec_inner.pyx":300 + /* "gensim/models/doc2vec_inner.pyx":338 * vlookup = model.wv.vocab * i = 0 * for token in doc_words: # <<<<<<<<<<<<<< @@ -4114,26 +4105,26 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_1 = __pyx_v_doc_words; __Pyx_INCREF(__pyx_t_1); __pyx_t_7 = 0; __pyx_t_11 = NULL; } else { - __pyx_t_7 = -1; __pyx_t_1 = PyObject_GetIter(__pyx_v_doc_words); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 300, __pyx_L1_error) + __pyx_t_7 = -1; __pyx_t_1 = PyObject_GetIter(__pyx_v_doc_words); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 338, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_11 = Py_TYPE(__pyx_t_1)->tp_iternext; if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 300, __pyx_L1_error) + __pyx_t_11 = Py_TYPE(__pyx_t_1)->tp_iternext; if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 338, __pyx_L1_error) } for (;;) { if (likely(!__pyx_t_11)) { if (likely(PyList_CheckExact(__pyx_t_1))) { if (__pyx_t_7 >= PyList_GET_SIZE(__pyx_t_1)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_10 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_7); __Pyx_INCREF(__pyx_t_10); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 300, __pyx_L1_error) + __pyx_t_10 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_7); __Pyx_INCREF(__pyx_t_10); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 338, __pyx_L1_error) #else - __pyx_t_10 = PySequence_ITEM(__pyx_t_1, __pyx_t_7); __pyx_t_7++; if 
(unlikely(!__pyx_t_10)) __PYX_ERR(0, 300, __pyx_L1_error) + __pyx_t_10 = PySequence_ITEM(__pyx_t_1, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 338, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); #endif } else { if (__pyx_t_7 >= PyTuple_GET_SIZE(__pyx_t_1)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_10 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_7); __Pyx_INCREF(__pyx_t_10); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 300, __pyx_L1_error) + __pyx_t_10 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_7); __Pyx_INCREF(__pyx_t_10); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 338, __pyx_L1_error) #else - __pyx_t_10 = PySequence_ITEM(__pyx_t_1, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 300, __pyx_L1_error) + __pyx_t_10 = PySequence_ITEM(__pyx_t_1, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 338, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); #endif } @@ -4143,7 +4134,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 300, __pyx_L1_error) + else __PYX_ERR(0, 338, __pyx_L1_error) } break; } @@ -4152,16 +4143,16 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __Pyx_XDECREF_SET(__pyx_v_token, __pyx_t_10); __pyx_t_10 = 0; - /* "gensim/models/doc2vec_inner.pyx":301 + /* "gensim/models/doc2vec_inner.pyx":339 * i = 0 * for token in doc_words: * predict_word = vlookup[token] if token in vlookup else None # <<<<<<<<<<<<<< * if predict_word is None: # shrink document to leave out word * continue # leaving i unchanged */ - __pyx_t_6 = (__Pyx_PySequence_ContainsTF(__pyx_v_token, __pyx_v_vlookup, Py_EQ)); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 301, __pyx_L1_error) + __pyx_t_6 = (__Pyx_PySequence_ContainsTF(__pyx_v_token, __pyx_v_vlookup, Py_EQ)); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 339, __pyx_L1_error) if ((__pyx_t_6 != 0)) { - __pyx_t_3 = PyObject_GetItem(__pyx_v_vlookup, __pyx_v_token); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 301, __pyx_L1_error) + __pyx_t_3 = PyObject_GetItem(__pyx_v_vlookup, __pyx_v_token); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 339, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __pyx_t_10 = __pyx_t_3; __pyx_t_3 = 0; @@ -4172,7 +4163,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __Pyx_XDECREF_SET(__pyx_v_predict_word, __pyx_t_10); __pyx_t_10 = 0; - /* "gensim/models/doc2vec_inner.pyx":302 + /* "gensim/models/doc2vec_inner.pyx":340 * for token in doc_words: * predict_word = vlookup[token] if token in vlookup else None * if predict_word is None: # shrink document to leave out word # <<<<<<<<<<<<<< @@ -4183,7 +4174,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = (__pyx_t_6 != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":303 + /* "gensim/models/doc2vec_inner.pyx":341 * predict_word = vlookup[token] if token in vlookup else None * if predict_word is None: # shrink document to leave out word * continue # leaving i unchanged # <<<<<<<<<<<<<< @@ -4192,7 +4183,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ goto __pyx_L13_continue; - /* "gensim/models/doc2vec_inner.pyx":302 + /* "gensim/models/doc2vec_inner.pyx":340 * for token in doc_words: * predict_word = vlookup[token] if token in vlookup else None * if predict_word is None: # shrink 
document to leave out word # <<<<<<<<<<<<<< @@ -4201,7 +4192,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":304 + /* "gensim/models/doc2vec_inner.pyx":342 * if predict_word is None: # shrink document to leave out word * continue # leaving i unchanged * if sample and predict_word.sample_int < random_int32(&next_random): # <<<<<<<<<<<<<< @@ -4214,20 +4205,20 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = __pyx_t_6; goto __pyx_L17_bool_binop_done; } - __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_sample_int); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 304, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_sample_int); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 342, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); - __pyx_t_3 = __Pyx_PyInt_From_unsigned_PY_LONG_LONG(__pyx_f_6gensim_6models_14word2vec_inner_random_int32((&__pyx_v_next_random))); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 304, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyInt_From_unsigned_PY_LONG_LONG(__pyx_f_6gensim_6models_14word2vec_inner_random_int32((&__pyx_v_next_random))); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 342, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_8 = PyObject_RichCompare(__pyx_t_10, __pyx_t_3, Py_LT); __Pyx_XGOTREF(__pyx_t_8); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 304, __pyx_L1_error) + __pyx_t_8 = PyObject_RichCompare(__pyx_t_10, __pyx_t_3, Py_LT); __Pyx_XGOTREF(__pyx_t_8); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 342, __pyx_L1_error) __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_8); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 304, __pyx_L1_error) + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_8); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 342, __pyx_L1_error) __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; __pyx_t_5 = __pyx_t_6; __pyx_L17_bool_binop_done:; if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":305 + /* "gensim/models/doc2vec_inner.pyx":343 * continue # leaving i unchanged * if sample and predict_word.sample_int < random_int32(&next_random): * continue # <<<<<<<<<<<<<< @@ -4236,7 +4227,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ goto __pyx_L13_continue; - /* "gensim/models/doc2vec_inner.pyx":304 + /* "gensim/models/doc2vec_inner.pyx":342 * if predict_word is None: # shrink document to leave out word * continue # leaving i unchanged * if sample and predict_word.sample_int < random_int32(&next_random): # <<<<<<<<<<<<<< @@ -4245,20 +4236,20 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":306 + /* "gensim/models/doc2vec_inner.pyx":344 * if sample and predict_word.sample_int < random_int32(&next_random): * continue * indexes[i] = predict_word.index # <<<<<<<<<<<<<< * if hs: * codelens[i] = len(predict_word.code) */ - __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_index); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 306, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_index); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 344, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); - __pyx_t_12 = __Pyx_PyInt_As_npy_uint32(__pyx_t_8); if (unlikely((__pyx_t_12 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 306, __pyx_L1_error) + __pyx_t_12 = __Pyx_PyInt_As_npy_uint32(__pyx_t_8); if 
(unlikely((__pyx_t_12 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 344, __pyx_L1_error) __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; (__pyx_v_indexes[__pyx_v_i]) = __pyx_t_12; - /* "gensim/models/doc2vec_inner.pyx":307 + /* "gensim/models/doc2vec_inner.pyx":345 * continue * indexes[i] = predict_word.index * if hs: # <<<<<<<<<<<<<< @@ -4268,46 +4259,46 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = (__pyx_v_hs != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":308 + /* "gensim/models/doc2vec_inner.pyx":346 * indexes[i] = predict_word.index * if hs: * codelens[i] = len(predict_word.code) # <<<<<<<<<<<<<< * codes[i] = np.PyArray_DATA(predict_word.code) * points[i] = np.PyArray_DATA(predict_word.point) */ - __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_code); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 308, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_code); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 346, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); - __pyx_t_13 = PyObject_Length(__pyx_t_8); if (unlikely(__pyx_t_13 == ((Py_ssize_t)-1))) __PYX_ERR(0, 308, __pyx_L1_error) + __pyx_t_13 = PyObject_Length(__pyx_t_8); if (unlikely(__pyx_t_13 == ((Py_ssize_t)-1))) __PYX_ERR(0, 346, __pyx_L1_error) __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; (__pyx_v_codelens[__pyx_v_i]) = ((int)__pyx_t_13); - /* "gensim/models/doc2vec_inner.pyx":309 + /* "gensim/models/doc2vec_inner.pyx":347 * if hs: * codelens[i] = len(predict_word.code) * codes[i] = np.PyArray_DATA(predict_word.code) # <<<<<<<<<<<<<< * points[i] = np.PyArray_DATA(predict_word.point) * result += 1 */ - __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_code); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 309, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_code); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 347, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); - if (!(likely(((__pyx_t_8) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_8, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 309, __pyx_L1_error) + if (!(likely(((__pyx_t_8) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_8, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 347, __pyx_L1_error) (__pyx_v_codes[__pyx_v_i]) = ((__pyx_t_5numpy_uint8_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_8))); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; - /* "gensim/models/doc2vec_inner.pyx":310 + /* "gensim/models/doc2vec_inner.pyx":348 * codelens[i] = len(predict_word.code) * codes[i] = np.PyArray_DATA(predict_word.code) * points[i] = np.PyArray_DATA(predict_word.point) # <<<<<<<<<<<<<< * result += 1 * i += 1 */ - __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_point); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 310, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_point); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 348, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); - if (!(likely(((__pyx_t_8) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_8, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 310, __pyx_L1_error) + if (!(likely(((__pyx_t_8) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_8, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 348, __pyx_L1_error) (__pyx_v_points[__pyx_v_i]) = ((__pyx_t_5numpy_uint32_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_8))); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; - /* "gensim/models/doc2vec_inner.pyx":307 + /* "gensim/models/doc2vec_inner.pyx":345 * continue * indexes[i] = predict_word.index * if hs: # 
<<<<<<<<<<<<<< @@ -4316,7 +4307,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":311 + /* "gensim/models/doc2vec_inner.pyx":349 * codes[i] = np.PyArray_DATA(predict_word.code) * points[i] = np.PyArray_DATA(predict_word.point) * result += 1 # <<<<<<<<<<<<<< @@ -4325,7 +4316,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ __pyx_v_result = (__pyx_v_result + 1); - /* "gensim/models/doc2vec_inner.pyx":312 + /* "gensim/models/doc2vec_inner.pyx":350 * points[i] = np.PyArray_DATA(predict_word.point) * result += 1 * i += 1 # <<<<<<<<<<<<<< @@ -4334,7 +4325,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ __pyx_v_i = (__pyx_v_i + 1); - /* "gensim/models/doc2vec_inner.pyx":313 + /* "gensim/models/doc2vec_inner.pyx":351 * result += 1 * i += 1 * if i == MAX_DOCUMENT_LEN: # <<<<<<<<<<<<<< @@ -4344,7 +4335,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = ((__pyx_v_i == 0x2710) != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":314 + /* "gensim/models/doc2vec_inner.pyx":352 * i += 1 * if i == MAX_DOCUMENT_LEN: * break # TODO: log warning, tally overflow? # <<<<<<<<<<<<<< @@ -4353,7 +4344,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ goto __pyx_L14_break; - /* "gensim/models/doc2vec_inner.pyx":313 + /* "gensim/models/doc2vec_inner.pyx":351 * result += 1 * i += 1 * if i == MAX_DOCUMENT_LEN: # <<<<<<<<<<<<<< @@ -4362,7 +4353,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":300 + /* "gensim/models/doc2vec_inner.pyx":338 * vlookup = model.wv.vocab * i = 0 * for token in doc_words: # <<<<<<<<<<<<<< @@ -4374,7 +4365,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_L14_break:; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/models/doc2vec_inner.pyx":315 + /* "gensim/models/doc2vec_inner.pyx":353 * if i == MAX_DOCUMENT_LEN: * break # TODO: log warning, tally overflow? 
* document_len = i # <<<<<<<<<<<<<< @@ -4383,7 +4374,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ __pyx_v_document_len = __pyx_v_i; - /* "gensim/models/doc2vec_inner.pyx":317 + /* "gensim/models/doc2vec_inner.pyx":355 * document_len = i * * if _train_words: # <<<<<<<<<<<<<< @@ -4393,7 +4384,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = (__pyx_v__train_words != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":319 + /* "gensim/models/doc2vec_inner.pyx":357 * if _train_words: * # single randint() call avoids a big thread-synchronization slowdown * for i, item in enumerate(model.random.randint(0, window, document_len)): # <<<<<<<<<<<<<< @@ -4401,14 +4392,14 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY * */ __pyx_t_2 = 0; - __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_random); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 319, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_random); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 357, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_8, __pyx_n_s_randint); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 319, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_8, __pyx_n_s_randint); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 357, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; - __pyx_t_8 = __Pyx_PyInt_From_int(__pyx_v_window); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 319, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyInt_From_int(__pyx_v_window); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 357, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); - __pyx_t_10 = __Pyx_PyInt_From_int(__pyx_v_document_len); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 319, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyInt_From_int(__pyx_v_document_len); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 357, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); __pyx_t_14 = NULL; __pyx_t_15 = 0; @@ -4425,7 +4416,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[4] = {__pyx_t_14, __pyx_int_0, __pyx_t_8, __pyx_t_10}; - __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_15, 3+__pyx_t_15); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 319, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_15, 3+__pyx_t_15); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 357, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_14); __pyx_t_14 = 0; __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; @@ -4435,7 +4426,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[4] = {__pyx_t_14, __pyx_int_0, __pyx_t_8, __pyx_t_10}; - __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_15, 3+__pyx_t_15); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 319, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_15, 3+__pyx_t_15); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 357, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_14); __pyx_t_14 = 0; __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; @@ -4443,7 +4434,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY } else #endif { - __pyx_t_16 = PyTuple_New(3+__pyx_t_15); if (unlikely(!__pyx_t_16)) __PYX_ERR(0, 
319, __pyx_L1_error) + __pyx_t_16 = PyTuple_New(3+__pyx_t_15); if (unlikely(!__pyx_t_16)) __PYX_ERR(0, 357, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_16); if (__pyx_t_14) { __Pyx_GIVEREF(__pyx_t_14); PyTuple_SET_ITEM(__pyx_t_16, 0, __pyx_t_14); __pyx_t_14 = NULL; @@ -4457,7 +4448,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY PyTuple_SET_ITEM(__pyx_t_16, 2+__pyx_t_15, __pyx_t_10); __pyx_t_8 = 0; __pyx_t_10 = 0; - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_16, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 319, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_16, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 357, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_16); __pyx_t_16 = 0; } @@ -4466,9 +4457,9 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_3 = __pyx_t_1; __Pyx_INCREF(__pyx_t_3); __pyx_t_7 = 0; __pyx_t_11 = NULL; } else { - __pyx_t_7 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 319, __pyx_L1_error) + __pyx_t_7 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 357, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_11 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 319, __pyx_L1_error) + __pyx_t_11 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 357, __pyx_L1_error) } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; for (;;) { @@ -4476,17 +4467,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY if (likely(PyList_CheckExact(__pyx_t_3))) { if (__pyx_t_7 >= PyList_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_1 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_1); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 319, __pyx_L1_error) + __pyx_t_1 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_1); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 357, __pyx_L1_error) #else - __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 319, __pyx_L1_error) + __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 357, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } else { if (__pyx_t_7 >= PyTuple_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_1); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 319, __pyx_L1_error) + __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_1); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 357, __pyx_L1_error) #else - __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 319, __pyx_L1_error) + __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 357, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } @@ -4496,7 +4487,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 319, __pyx_L1_error) + else __PYX_ERR(0, 357, __pyx_L1_error) } break; } @@ -4507,17 +4498,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_v_i = __pyx_t_2; __pyx_t_2 = (__pyx_t_2 + 1); - /* 
"gensim/models/doc2vec_inner.pyx":320 + /* "gensim/models/doc2vec_inner.pyx":358 * # single randint() call avoids a big thread-synchronization slowdown * for i, item in enumerate(model.random.randint(0, window, document_len)): * reduced_windows[i] = item # <<<<<<<<<<<<<< * * doctag_len = min(MAX_DOCUMENT_LEN, len(doctag_indexes)) */ - __pyx_t_12 = __Pyx_PyInt_As_npy_uint32(__pyx_v_item); if (unlikely((__pyx_t_12 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 320, __pyx_L1_error) + __pyx_t_12 = __Pyx_PyInt_As_npy_uint32(__pyx_v_item); if (unlikely((__pyx_t_12 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 358, __pyx_L1_error) (__pyx_v_reduced_windows[__pyx_v_i]) = __pyx_t_12; - /* "gensim/models/doc2vec_inner.pyx":319 + /* "gensim/models/doc2vec_inner.pyx":357 * if _train_words: * # single randint() call avoids a big thread-synchronization slowdown * for i, item in enumerate(model.random.randint(0, window, document_len)): # <<<<<<<<<<<<<< @@ -4527,7 +4518,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/models/doc2vec_inner.pyx":317 + /* "gensim/models/doc2vec_inner.pyx":355 * document_len = i * * if _train_words: # <<<<<<<<<<<<<< @@ -4536,14 +4527,14 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":322 + /* "gensim/models/doc2vec_inner.pyx":360 * reduced_windows[i] = item * * doctag_len = min(MAX_DOCUMENT_LEN, len(doctag_indexes)) # <<<<<<<<<<<<<< * for i in range(doctag_len): * _doctag_indexes[i] = doctag_indexes[i] */ - __pyx_t_7 = PyObject_Length(__pyx_v_doctag_indexes); if (unlikely(__pyx_t_7 == ((Py_ssize_t)-1))) __PYX_ERR(0, 322, __pyx_L1_error) + __pyx_t_7 = PyObject_Length(__pyx_v_doctag_indexes); if (unlikely(__pyx_t_7 == ((Py_ssize_t)-1))) __PYX_ERR(0, 360, __pyx_L1_error) __pyx_t_17 = 0x2710; if (((__pyx_t_7 < __pyx_t_17) != 0)) { __pyx_t_13 = __pyx_t_7; @@ -4552,7 +4543,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY } __pyx_v_doctag_len = ((int)__pyx_t_13); - /* "gensim/models/doc2vec_inner.pyx":323 + /* "gensim/models/doc2vec_inner.pyx":361 * * doctag_len = min(MAX_DOCUMENT_LEN, len(doctag_indexes)) * for i in range(doctag_len): # <<<<<<<<<<<<<< @@ -4563,20 +4554,20 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY for (__pyx_t_15 = 0; __pyx_t_15 < __pyx_t_2; __pyx_t_15+=1) { __pyx_v_i = __pyx_t_15; - /* "gensim/models/doc2vec_inner.pyx":324 + /* "gensim/models/doc2vec_inner.pyx":362 * doctag_len = min(MAX_DOCUMENT_LEN, len(doctag_indexes)) * for i in range(doctag_len): * _doctag_indexes[i] = doctag_indexes[i] # <<<<<<<<<<<<<< * result += 1 * */ - __pyx_t_3 = __Pyx_GetItemInt(__pyx_v_doctag_indexes, __pyx_v_i, int, 1, __Pyx_PyInt_From_int, 0, 0, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 324, __pyx_L1_error) + __pyx_t_3 = __Pyx_GetItemInt(__pyx_v_doctag_indexes, __pyx_v_i, int, 1, __Pyx_PyInt_From_int, 0, 0, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 362, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_12 = __Pyx_PyInt_As_npy_uint32(__pyx_t_3); if (unlikely((__pyx_t_12 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 324, __pyx_L1_error) + __pyx_t_12 = __Pyx_PyInt_As_npy_uint32(__pyx_t_3); if (unlikely((__pyx_t_12 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 362, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; (__pyx_v__doctag_indexes[__pyx_v_i]) = __pyx_t_12; - /* 
"gensim/models/doc2vec_inner.pyx":325 + /* "gensim/models/doc2vec_inner.pyx":363 * for i in range(doctag_len): * _doctag_indexes[i] = doctag_indexes[i] * result += 1 # <<<<<<<<<<<<<< @@ -4586,7 +4577,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_v_result = (__pyx_v_result + 1); } - /* "gensim/models/doc2vec_inner.pyx":328 + /* "gensim/models/doc2vec_inner.pyx":366 * * # release GIL & train on the document * with nogil: # <<<<<<<<<<<<<< @@ -4601,7 +4592,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY #endif /*try:*/ { - /* "gensim/models/doc2vec_inner.pyx":329 + /* "gensim/models/doc2vec_inner.pyx":367 * # release GIL & train on the document * with nogil: * for i in range(document_len): # <<<<<<<<<<<<<< @@ -4612,7 +4603,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY for (__pyx_t_15 = 0; __pyx_t_15 < __pyx_t_2; __pyx_t_15+=1) { __pyx_v_i = __pyx_t_15; - /* "gensim/models/doc2vec_inner.pyx":330 + /* "gensim/models/doc2vec_inner.pyx":368 * with nogil: * for i in range(document_len): * if _train_words: # simultaneous skip-gram wordvec-training # <<<<<<<<<<<<<< @@ -4622,7 +4613,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = (__pyx_v__train_words != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":331 + /* "gensim/models/doc2vec_inner.pyx":369 * for i in range(document_len): * if _train_words: # simultaneous skip-gram wordvec-training * j = i - window + reduced_windows[i] # <<<<<<<<<<<<<< @@ -4631,7 +4622,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ __pyx_v_j = ((__pyx_v_i - __pyx_v_window) + (__pyx_v_reduced_windows[__pyx_v_i])); - /* "gensim/models/doc2vec_inner.pyx":332 + /* "gensim/models/doc2vec_inner.pyx":370 * if _train_words: # simultaneous skip-gram wordvec-training * j = i - window + reduced_windows[i] * if j < 0: # <<<<<<<<<<<<<< @@ -4641,7 +4632,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = ((__pyx_v_j < 0) != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":333 + /* "gensim/models/doc2vec_inner.pyx":371 * j = i - window + reduced_windows[i] * if j < 0: * j = 0 # <<<<<<<<<<<<<< @@ -4650,7 +4641,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ __pyx_v_j = 0; - /* "gensim/models/doc2vec_inner.pyx":332 + /* "gensim/models/doc2vec_inner.pyx":370 * if _train_words: # simultaneous skip-gram wordvec-training * j = i - window + reduced_windows[i] * if j < 0: # <<<<<<<<<<<<<< @@ -4659,7 +4650,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":334 + /* "gensim/models/doc2vec_inner.pyx":372 * if j < 0: * j = 0 * k = i + window + 1 - reduced_windows[i] # <<<<<<<<<<<<<< @@ -4668,7 +4659,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ __pyx_v_k = (((__pyx_v_i + __pyx_v_window) + 1) - (__pyx_v_reduced_windows[__pyx_v_i])); - /* "gensim/models/doc2vec_inner.pyx":335 + /* "gensim/models/doc2vec_inner.pyx":373 * j = 0 * k = i + window + 1 - reduced_windows[i] * if k > document_len: # <<<<<<<<<<<<<< @@ -4678,7 +4669,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = ((__pyx_v_k > __pyx_v_document_len) != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":336 + /* "gensim/models/doc2vec_inner.pyx":374 * k 
= i + window + 1 - reduced_windows[i] * if k > document_len: * k = document_len # <<<<<<<<<<<<<< @@ -4687,7 +4678,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ __pyx_v_k = __pyx_v_document_len; - /* "gensim/models/doc2vec_inner.pyx":335 + /* "gensim/models/doc2vec_inner.pyx":373 * j = 0 * k = i + window + 1 - reduced_windows[i] * if k > document_len: # <<<<<<<<<<<<<< @@ -4696,7 +4687,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":337 + /* "gensim/models/doc2vec_inner.pyx":375 * if k > document_len: * k = document_len * for j in range(j, k): # <<<<<<<<<<<<<< @@ -4707,7 +4698,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY for (__pyx_t_18 = __pyx_v_j; __pyx_t_18 < __pyx_t_17; __pyx_t_18+=1) { __pyx_v_j = __pyx_t_18; - /* "gensim/models/doc2vec_inner.pyx":338 + /* "gensim/models/doc2vec_inner.pyx":376 * k = document_len * for j in range(j, k): * if j == i: # <<<<<<<<<<<<<< @@ -4717,7 +4708,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = ((__pyx_v_j == __pyx_v_i) != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":339 + /* "gensim/models/doc2vec_inner.pyx":377 * for j in range(j, k): * if j == i: * continue # <<<<<<<<<<<<<< @@ -4726,7 +4717,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ goto __pyx_L34_continue; - /* "gensim/models/doc2vec_inner.pyx":338 + /* "gensim/models/doc2vec_inner.pyx":376 * k = document_len * for j in range(j, k): * if j == i: # <<<<<<<<<<<<<< @@ -4735,7 +4726,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":340 + /* "gensim/models/doc2vec_inner.pyx":378 * if j == i: * continue * if hs: # <<<<<<<<<<<<<< @@ -4745,7 +4736,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = (__pyx_v_hs != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":342 + /* "gensim/models/doc2vec_inner.pyx":380 * if hs: * # we reuse the DBOW function, as it is equivalent to skip-gram for this purpose * fast_document_dbow_hs(points[i], codes[i], codelens[i], _word_vectors, syn1, size, indexes[j], # <<<<<<<<<<<<<< @@ -4754,7 +4745,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs((__pyx_v_points[__pyx_v_i]), (__pyx_v_codes[__pyx_v_i]), (__pyx_v_codelens[__pyx_v_i]), __pyx_v__word_vectors, __pyx_v_syn1, __pyx_v_size, (__pyx_v_indexes[__pyx_v_j]), __pyx_v__alpha, __pyx_v__work, __pyx_v__learn_words, __pyx_v__learn_hidden, __pyx_v__word_locks); - /* "gensim/models/doc2vec_inner.pyx":340 + /* "gensim/models/doc2vec_inner.pyx":378 * if j == i: * continue * if hs: # <<<<<<<<<<<<<< @@ -4763,7 +4754,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":344 + /* "gensim/models/doc2vec_inner.pyx":382 * fast_document_dbow_hs(points[i], codes[i], codelens[i], _word_vectors, syn1, size, indexes[j], * _alpha, _work, _learn_words, _learn_hidden, _word_locks) * if negative: # <<<<<<<<<<<<<< @@ -4773,7 +4764,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = (__pyx_v_negative != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":346 + /* "gensim/models/doc2vec_inner.pyx":384 * if negative: * # 
we reuse the DBOW function, as it is equivalent to skip-gram for this purpose * next_random = fast_document_dbow_neg(negative, cum_table, cum_table_len, _word_vectors, syn1neg, size, # <<<<<<<<<<<<<< @@ -4782,7 +4773,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ __pyx_v_next_random = __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_neg(__pyx_v_negative, __pyx_v_cum_table, __pyx_v_cum_table_len, __pyx_v__word_vectors, __pyx_v_syn1neg, __pyx_v_size, (__pyx_v_indexes[__pyx_v_i]), (__pyx_v_indexes[__pyx_v_j]), __pyx_v__alpha, __pyx_v__work, __pyx_v_next_random, __pyx_v__learn_words, __pyx_v__learn_hidden, __pyx_v__word_locks); - /* "gensim/models/doc2vec_inner.pyx":344 + /* "gensim/models/doc2vec_inner.pyx":382 * fast_document_dbow_hs(points[i], codes[i], codelens[i], _word_vectors, syn1, size, indexes[j], * _alpha, _work, _learn_words, _learn_hidden, _word_locks) * if negative: # <<<<<<<<<<<<<< @@ -4793,7 +4784,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_L34_continue:; } - /* "gensim/models/doc2vec_inner.pyx":330 + /* "gensim/models/doc2vec_inner.pyx":368 * with nogil: * for i in range(document_len): * if _train_words: # simultaneous skip-gram wordvec-training # <<<<<<<<<<<<<< @@ -4802,7 +4793,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":351 + /* "gensim/models/doc2vec_inner.pyx":389 * * # docvec-training * for j in range(doctag_len): # <<<<<<<<<<<<<< @@ -4813,7 +4804,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY for (__pyx_t_19 = 0; __pyx_t_19 < __pyx_t_18; __pyx_t_19+=1) { __pyx_v_j = __pyx_t_19; - /* "gensim/models/doc2vec_inner.pyx":352 + /* "gensim/models/doc2vec_inner.pyx":390 * # docvec-training * for j in range(doctag_len): * if hs: # <<<<<<<<<<<<<< @@ -4823,7 +4814,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = (__pyx_v_hs != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":353 + /* "gensim/models/doc2vec_inner.pyx":391 * for j in range(doctag_len): * if hs: * fast_document_dbow_hs(points[i], codes[i], codelens[i], _doctag_vectors, syn1, size, _doctag_indexes[j], # <<<<<<<<<<<<<< @@ -4832,7 +4823,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs((__pyx_v_points[__pyx_v_i]), (__pyx_v_codes[__pyx_v_i]), (__pyx_v_codelens[__pyx_v_i]), __pyx_v__doctag_vectors, __pyx_v_syn1, __pyx_v_size, (__pyx_v__doctag_indexes[__pyx_v_j]), __pyx_v__alpha, __pyx_v__work, __pyx_v__learn_doctags, __pyx_v__learn_hidden, __pyx_v__doctag_locks); - /* "gensim/models/doc2vec_inner.pyx":352 + /* "gensim/models/doc2vec_inner.pyx":390 * # docvec-training * for j in range(doctag_len): * if hs: # <<<<<<<<<<<<<< @@ -4841,7 +4832,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ } - /* "gensim/models/doc2vec_inner.pyx":355 + /* "gensim/models/doc2vec_inner.pyx":393 * fast_document_dbow_hs(points[i], codes[i], codelens[i], _doctag_vectors, syn1, size, _doctag_indexes[j], * _alpha, _work, _learn_doctags, _learn_hidden, _doctag_locks) * if negative: # <<<<<<<<<<<<<< @@ -4851,7 +4842,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY __pyx_t_5 = (__pyx_v_negative != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":356 + /* 
"gensim/models/doc2vec_inner.pyx":394 * _alpha, _work, _learn_doctags, _learn_hidden, _doctag_locks) * if negative: * next_random = fast_document_dbow_neg(negative, cum_table, cum_table_len, _doctag_vectors, syn1neg, size, # <<<<<<<<<<<<<< @@ -4860,7 +4851,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY */ __pyx_v_next_random = __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dbow_neg(__pyx_v_negative, __pyx_v_cum_table, __pyx_v_cum_table_len, __pyx_v__doctag_vectors, __pyx_v_syn1neg, __pyx_v_size, (__pyx_v_indexes[__pyx_v_i]), (__pyx_v__doctag_indexes[__pyx_v_j]), __pyx_v__alpha, __pyx_v__work, __pyx_v_next_random, __pyx_v__learn_doctags, __pyx_v__learn_hidden, __pyx_v__doctag_locks); - /* "gensim/models/doc2vec_inner.pyx":355 + /* "gensim/models/doc2vec_inner.pyx":393 * fast_document_dbow_hs(points[i], codes[i], codelens[i], _doctag_vectors, syn1, size, _doctag_indexes[j], * _alpha, _work, _learn_doctags, _learn_hidden, _doctag_locks) * if negative: # <<<<<<<<<<<<<< @@ -4872,7 +4863,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY } } - /* "gensim/models/doc2vec_inner.pyx":328 + /* "gensim/models/doc2vec_inner.pyx":366 * * # release GIL & train on the document * with nogil: # <<<<<<<<<<<<<< @@ -4891,7 +4882,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY } } - /* "gensim/models/doc2vec_inner.pyx":360 + /* "gensim/models/doc2vec_inner.pyx":398 * _learn_doctags, _learn_hidden, _doctag_locks) * * return result # <<<<<<<<<<<<<< @@ -4899,13 +4890,13 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY * */ __Pyx_XDECREF(__pyx_r); - __pyx_t_3 = __Pyx_PyInt_From_long(__pyx_v_result); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 360, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyInt_From_long(__pyx_v_result); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 398, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __pyx_r = __pyx_t_3; __pyx_t_3 = 0; goto __pyx_L0; - /* "gensim/models/doc2vec_inner.pyx":227 + /* "gensim/models/doc2vec_inner.pyx":221 * * * def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, # <<<<<<<<<<<<<< @@ -4938,7 +4929,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY return __pyx_r; } -/* "gensim/models/doc2vec_inner.pyx":363 +/* "gensim/models/doc2vec_inner.pyx":401 * * * def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, # <<<<<<<<<<<<<< @@ -4948,7 +4939,8 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_train_document_dbow(CY /* Python wrapper */ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_3train_document_dm(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ -static PyMethodDef __pyx_mdef_6gensim_6models_13doc2vec_inner_3train_document_dm = {"train_document_dm", (PyCFunction)__pyx_pw_6gensim_6models_13doc2vec_inner_3train_document_dm, METH_VARARGS|METH_KEYWORDS, 0}; +static char __pyx_doc_6gensim_6models_13doc2vec_inner_2train_document_dm[] = "train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, learn_doctags=True, learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None)\nUpdate distributed memory model (\"PV-DM\") by training on a single document.\n This method implements the DM model with a projection (input) layer that is either the sum or mean of the context\n vectors, depending on the model's `dm_mean` configuration field.\n\n 
Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and\n :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector`.\n\n Parameters\n ----------\n model : :class:`~gensim.models.doc2vec.Doc2Vec`\n The model to train.\n doc_words : list of str\n The input document as a list of words to be used for training. Each word will be looked up in\n the model's vocabulary.\n doctag_indexes : list of int\n Indices into `doctag_vectors` used to obtain the tags of the document.\n alpha : float\n Learning rate.\n work : np.ndarray, optional\n Private working memory for each worker.\n neu1 : np.ndarray, optional\n Private working memory for each worker.\n learn_doctags : bool, optional\n Whether the tag vectors should be updated.\n learn_words : bool, optional\n Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both**\n `learn_words` and `train_words` are set to True.\n learn_hidden : bool, optional\n Whether or not the weights of the hidden layer will be updated.\n word_vectors : numpy.ndarray, optional\n The vector representation for each word in the vocabulary. If None, these will be retrieved from the model.\n word_locks : numpy.ndarray, optional\n A learning lock factor for each weight in the hidden layer for words, value 0 completely blocks updates,\n a value of 1 allows to update word-vectors.\n doctag_vectors : n""umpy.ndarray, optional\n Vector representations of the tags. If None, these will be retrieved from the model.\n doctag_locks : numpy.ndarray, optional\n The lock factors for each tag, same as `word_locks`, but for document-vectors.\n\n Returns\n -------\n int\n Number of words in the input document that were actually used for training.\n\n "; +static PyMethodDef __pyx_mdef_6gensim_6models_13doc2vec_inner_3train_document_dm = {"train_document_dm", (PyCFunction)__pyx_pw_6gensim_6models_13doc2vec_inner_3train_document_dm, METH_VARARGS|METH_KEYWORDS, __pyx_doc_6gensim_6models_13doc2vec_inner_2train_document_dm}; static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_3train_document_dm(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject *__pyx_v_model = 0; PyObject *__pyx_v_doc_words = 0; @@ -4972,23 +4964,23 @@ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_3train_document_dm(PyO values[4] = ((PyObject *)Py_None); values[5] = ((PyObject *)Py_None); - /* "gensim/models/doc2vec_inner.pyx":364 + /* "gensim/models/doc2vec_inner.pyx":402 * * def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, * learn_doctags=True, learn_words=True, learn_hidden=True, # <<<<<<<<<<<<<< * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): - * cdef int hs = model.hs + * """Update distributed memory model ("PV-DM") by training on a single document. */ values[6] = ((PyObject *)Py_True); values[7] = ((PyObject *)Py_True); values[8] = ((PyObject *)Py_True); - /* "gensim/models/doc2vec_inner.pyx":365 + /* "gensim/models/doc2vec_inner.pyx":403 * def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, * learn_doctags=True, learn_words=True, learn_hidden=True, * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): # <<<<<<<<<<<<<< - * cdef int hs = model.hs - * cdef int negative = model.negative + * """Update distributed memory model ("PV-DM") by training on a single document. 
+ * This method implements the DM model with a projection (input) layer that is either the sum or mean of the context */ values[9] = ((PyObject *)Py_None); values[10] = ((PyObject *)Py_None); @@ -5036,19 +5028,19 @@ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_3train_document_dm(PyO case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_doc_words)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train_document_dm", 0, 4, 13, 1); __PYX_ERR(0, 363, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("train_document_dm", 0, 4, 13, 1); __PYX_ERR(0, 401, __pyx_L3_error) } CYTHON_FALLTHROUGH; case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_doctag_indexes)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train_document_dm", 0, 4, 13, 2); __PYX_ERR(0, 363, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("train_document_dm", 0, 4, 13, 2); __PYX_ERR(0, 401, __pyx_L3_error) } CYTHON_FALLTHROUGH; case 3: if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_alpha)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train_document_dm", 0, 4, 13, 3); __PYX_ERR(0, 363, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("train_document_dm", 0, 4, 13, 3); __PYX_ERR(0, 401, __pyx_L3_error) } CYTHON_FALLTHROUGH; case 4: @@ -5106,7 +5098,7 @@ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_3train_document_dm(PyO } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train_document_dm") < 0)) __PYX_ERR(0, 363, __pyx_L3_error) + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train_document_dm") < 0)) __PYX_ERR(0, 401, __pyx_L3_error) } } else { switch (PyTuple_GET_SIZE(__pyx_args)) { @@ -5152,7 +5144,7 @@ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_3train_document_dm(PyO } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("train_document_dm", 0, 4, 13, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 363, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("train_document_dm", 0, 4, 13, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 401, __pyx_L3_error) __pyx_L3_error:; __Pyx_AddTraceback("gensim.models.doc2vec_inner.train_document_dm", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); @@ -5160,7 +5152,7 @@ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_3train_document_dm(PyO __pyx_L4_argument_unpacking_done:; __pyx_r = __pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(__pyx_self, __pyx_v_model, __pyx_v_doc_words, __pyx_v_doctag_indexes, __pyx_v_alpha, __pyx_v_work, __pyx_v_neu1, __pyx_v_learn_doctags, __pyx_v_learn_words, __pyx_v_learn_hidden, __pyx_v_word_vectors, __pyx_v_word_locks, __pyx_v_doctag_vectors, __pyx_v_doctag_locks); - /* "gensim/models/doc2vec_inner.pyx":363 + /* "gensim/models/doc2vec_inner.pyx":401 * * * def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, # <<<<<<<<<<<<<< @@ -5243,94 +5235,94 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __Pyx_INCREF(__pyx_v_doctag_vectors); __Pyx_INCREF(__pyx_v_doctag_locks); - /* "gensim/models/doc2vec_inner.pyx":366 - * learn_doctags=True, learn_words=True, learn_hidden=True, - * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): + /* "gensim/models/doc2vec_inner.pyx":449 + * + * """ * cdef int hs = model.hs # <<<<<<<<<<<<<< * cdef int negative = model.negative * cdef int sample = 
(model.vocabulary.sample != 0) */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_hs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 366, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_hs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 449, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 366, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 449, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_hs = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":367 - * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): + /* "gensim/models/doc2vec_inner.pyx":450 + * """ * cdef int hs = model.hs * cdef int negative = model.negative # <<<<<<<<<<<<<< * cdef int sample = (model.vocabulary.sample != 0) * cdef int _learn_doctags = learn_doctags */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_negative); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 367, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_negative); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 450, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 367, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 450, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_negative = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":368 + /* "gensim/models/doc2vec_inner.pyx":451 * cdef int hs = model.hs * cdef int negative = model.negative * cdef int sample = (model.vocabulary.sample != 0) # <<<<<<<<<<<<<< * cdef int _learn_doctags = learn_doctags * cdef int _learn_words = learn_words */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 368, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 451, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_sample); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 368, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_sample); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 451, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = PyObject_RichCompare(__pyx_t_3, __pyx_int_0, Py_NE); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 368, __pyx_L1_error) + __pyx_t_1 = PyObject_RichCompare(__pyx_t_3, __pyx_int_0, Py_NE); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 451, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 368, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 451, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_sample = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":369 + /* "gensim/models/doc2vec_inner.pyx":452 * cdef int negative = model.negative * cdef int sample = (model.vocabulary.sample != 0) * cdef int _learn_doctags = learn_doctags # <<<<<<<<<<<<<< * cdef int _learn_words = learn_words * cdef int 
_learn_hidden = learn_hidden */ - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_doctags); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 369, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_doctags); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 452, __pyx_L1_error) __pyx_v__learn_doctags = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":370 + /* "gensim/models/doc2vec_inner.pyx":453 * cdef int sample = (model.vocabulary.sample != 0) * cdef int _learn_doctags = learn_doctags * cdef int _learn_words = learn_words # <<<<<<<<<<<<<< * cdef int _learn_hidden = learn_hidden * cdef int cbow_mean = model.cbow_mean */ - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_words); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 370, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_words); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 453, __pyx_L1_error) __pyx_v__learn_words = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":371 + /* "gensim/models/doc2vec_inner.pyx":454 * cdef int _learn_doctags = learn_doctags * cdef int _learn_words = learn_words * cdef int _learn_hidden = learn_hidden # <<<<<<<<<<<<<< * cdef int cbow_mean = model.cbow_mean * cdef REAL_t count, inv_count = 1.0 */ - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_hidden); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 371, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_hidden); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 454, __pyx_L1_error) __pyx_v__learn_hidden = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":372 + /* "gensim/models/doc2vec_inner.pyx":455 * cdef int _learn_words = learn_words * cdef int _learn_hidden = learn_hidden * cdef int cbow_mean = model.cbow_mean # <<<<<<<<<<<<<< * cdef REAL_t count, inv_count = 1.0 * */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_cbow_mean); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 372, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_cbow_mean); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 455, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 372, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 455, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_cbow_mean = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":373 + /* "gensim/models/doc2vec_inner.pyx":456 * cdef int _learn_hidden = learn_hidden * cdef int cbow_mean = model.cbow_mean * cdef REAL_t count, inv_count = 1.0 # <<<<<<<<<<<<<< @@ -5339,46 +5331,46 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_v_inv_count = 1.0; - /* "gensim/models/doc2vec_inner.pyx":381 + /* "gensim/models/doc2vec_inner.pyx":464 * cdef REAL_t *_work * cdef REAL_t *_neu1 * cdef REAL_t _alpha = alpha # <<<<<<<<<<<<<< * cdef int size = model.trainables.layer1_size * */ - __pyx_t_4 = __pyx_PyFloat_AsFloat(__pyx_v_alpha); if (unlikely((__pyx_t_4 == ((npy_float32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 381, __pyx_L1_error) + __pyx_t_4 = __pyx_PyFloat_AsFloat(__pyx_v_alpha); if (unlikely((__pyx_t_4 == ((npy_float32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 464, __pyx_L1_error) __pyx_v__alpha = __pyx_t_4; - /* "gensim/models/doc2vec_inner.pyx":382 + /* "gensim/models/doc2vec_inner.pyx":465 * 
cdef REAL_t *_neu1 * cdef REAL_t _alpha = alpha * cdef int size = model.trainables.layer1_size # <<<<<<<<<<<<<< * * cdef int codelens[MAX_DOCUMENT_LEN] */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 382, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 465, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 382, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 465, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 382, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 465, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_v_size = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":390 + /* "gensim/models/doc2vec_inner.pyx":473 * cdef int document_len * cdef int doctag_len * cdef int window = model.window # <<<<<<<<<<<<<< * * cdef int i, j, k, m */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_window); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 390, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_window); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 473, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 390, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 473, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_v_window = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":393 + /* "gensim/models/doc2vec_inner.pyx":476 * * cdef int i, j, k, m * cdef long result = 0 # <<<<<<<<<<<<<< @@ -5387,7 +5379,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_v_result = 0; - /* "gensim/models/doc2vec_inner.pyx":408 + /* "gensim/models/doc2vec_inner.pyx":491 * * # default vectors, locks from syn0/doctag_syn0 * if word_vectors is None: # <<<<<<<<<<<<<< @@ -5398,22 +5390,22 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = (__pyx_t_5 != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":409 + /* "gensim/models/doc2vec_inner.pyx":492 * # default vectors, locks from syn0/doctag_syn0 * if word_vectors is None: * word_vectors = model.wv.vectors # <<<<<<<<<<<<<< * _word_vectors = (np.PyArray_DATA(word_vectors)) * if doctag_vectors is None: */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 409, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 492, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_vectors); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 409, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_vectors); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 492, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF_SET(__pyx_v_word_vectors, __pyx_t_1); __pyx_t_1 = 
0; - /* "gensim/models/doc2vec_inner.pyx":408 + /* "gensim/models/doc2vec_inner.pyx":491 * * # default vectors, locks from syn0/doctag_syn0 * if word_vectors is None: # <<<<<<<<<<<<<< @@ -5422,17 +5414,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":410 + /* "gensim/models/doc2vec_inner.pyx":493 * if word_vectors is None: * word_vectors = model.wv.vectors * _word_vectors = (np.PyArray_DATA(word_vectors)) # <<<<<<<<<<<<<< * if doctag_vectors is None: * doctag_vectors = model.docvecs.vectors_docs */ - if (!(likely(((__pyx_v_word_vectors) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_word_vectors, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 410, __pyx_L1_error) + if (!(likely(((__pyx_v_word_vectors) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_word_vectors, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 493, __pyx_L1_error) __pyx_v__word_vectors = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_word_vectors))); - /* "gensim/models/doc2vec_inner.pyx":411 + /* "gensim/models/doc2vec_inner.pyx":494 * word_vectors = model.wv.vectors * _word_vectors = (np.PyArray_DATA(word_vectors)) * if doctag_vectors is None: # <<<<<<<<<<<<<< @@ -5443,22 +5435,22 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_5 = (__pyx_t_6 != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":412 + /* "gensim/models/doc2vec_inner.pyx":495 * _word_vectors = (np.PyArray_DATA(word_vectors)) * if doctag_vectors is None: * doctag_vectors = model.docvecs.vectors_docs # <<<<<<<<<<<<<< * _doctag_vectors = (np.PyArray_DATA(doctag_vectors)) * if word_locks is None: */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_docvecs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 412, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_docvecs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 495, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_vectors_docs); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 412, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_vectors_docs); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 495, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF_SET(__pyx_v_doctag_vectors, __pyx_t_3); __pyx_t_3 = 0; - /* "gensim/models/doc2vec_inner.pyx":411 + /* "gensim/models/doc2vec_inner.pyx":494 * word_vectors = model.wv.vectors * _word_vectors = (np.PyArray_DATA(word_vectors)) * if doctag_vectors is None: # <<<<<<<<<<<<<< @@ -5467,17 +5459,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":413 + /* "gensim/models/doc2vec_inner.pyx":496 * if doctag_vectors is None: * doctag_vectors = model.docvecs.vectors_docs * _doctag_vectors = (np.PyArray_DATA(doctag_vectors)) # <<<<<<<<<<<<<< * if word_locks is None: * word_locks = model.trainables.vectors_lockf */ - if (!(likely(((__pyx_v_doctag_vectors) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_doctag_vectors, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 413, __pyx_L1_error) + if (!(likely(((__pyx_v_doctag_vectors) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_doctag_vectors, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 496, __pyx_L1_error) __pyx_v__doctag_vectors = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_doctag_vectors))); - /* 
"gensim/models/doc2vec_inner.pyx":414 + /* "gensim/models/doc2vec_inner.pyx":497 * doctag_vectors = model.docvecs.vectors_docs * _doctag_vectors = (np.PyArray_DATA(doctag_vectors)) * if word_locks is None: # <<<<<<<<<<<<<< @@ -5488,22 +5480,22 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = (__pyx_t_5 != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":415 + /* "gensim/models/doc2vec_inner.pyx":498 * _doctag_vectors = (np.PyArray_DATA(doctag_vectors)) * if word_locks is None: * word_locks = model.trainables.vectors_lockf # <<<<<<<<<<<<<< * _word_locks = (np.PyArray_DATA(word_locks)) * if doctag_locks is None: */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 415, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 498, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_vectors_lockf); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 415, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_vectors_lockf); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 498, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF_SET(__pyx_v_word_locks, __pyx_t_1); __pyx_t_1 = 0; - /* "gensim/models/doc2vec_inner.pyx":414 + /* "gensim/models/doc2vec_inner.pyx":497 * doctag_vectors = model.docvecs.vectors_docs * _doctag_vectors = (np.PyArray_DATA(doctag_vectors)) * if word_locks is None: # <<<<<<<<<<<<<< @@ -5512,17 +5504,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":416 + /* "gensim/models/doc2vec_inner.pyx":499 * if word_locks is None: * word_locks = model.trainables.vectors_lockf * _word_locks = (np.PyArray_DATA(word_locks)) # <<<<<<<<<<<<<< * if doctag_locks is None: * doctag_locks = model.trainables.vectors_docs_lockf */ - if (!(likely(((__pyx_v_word_locks) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_word_locks, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 416, __pyx_L1_error) + if (!(likely(((__pyx_v_word_locks) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_word_locks, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 499, __pyx_L1_error) __pyx_v__word_locks = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_word_locks))); - /* "gensim/models/doc2vec_inner.pyx":417 + /* "gensim/models/doc2vec_inner.pyx":500 * word_locks = model.trainables.vectors_lockf * _word_locks = (np.PyArray_DATA(word_locks)) * if doctag_locks is None: # <<<<<<<<<<<<<< @@ -5533,22 +5525,22 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_5 = (__pyx_t_6 != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":418 + /* "gensim/models/doc2vec_inner.pyx":501 * _word_locks = (np.PyArray_DATA(word_locks)) * if doctag_locks is None: * doctag_locks = model.trainables.vectors_docs_lockf # <<<<<<<<<<<<<< * _doctag_locks = (np.PyArray_DATA(doctag_locks)) * */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 418, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 501, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_vectors_docs_lockf); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 
418, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_vectors_docs_lockf); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 501, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF_SET(__pyx_v_doctag_locks, __pyx_t_3); __pyx_t_3 = 0; - /* "gensim/models/doc2vec_inner.pyx":417 + /* "gensim/models/doc2vec_inner.pyx":500 * word_locks = model.trainables.vectors_lockf * _word_locks = (np.PyArray_DATA(word_locks)) * if doctag_locks is None: # <<<<<<<<<<<<<< @@ -5557,17 +5549,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":419 + /* "gensim/models/doc2vec_inner.pyx":502 * if doctag_locks is None: * doctag_locks = model.trainables.vectors_docs_lockf * _doctag_locks = (np.PyArray_DATA(doctag_locks)) # <<<<<<<<<<<<<< * * if hs: */ - if (!(likely(((__pyx_v_doctag_locks) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_doctag_locks, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 419, __pyx_L1_error) + if (!(likely(((__pyx_v_doctag_locks) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_doctag_locks, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 502, __pyx_L1_error) __pyx_v__doctag_locks = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_doctag_locks))); - /* "gensim/models/doc2vec_inner.pyx":421 + /* "gensim/models/doc2vec_inner.pyx":504 * _doctag_locks = (np.PyArray_DATA(doctag_locks)) * * if hs: # <<<<<<<<<<<<<< @@ -5577,23 +5569,23 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_5 = (__pyx_v_hs != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":422 + /* "gensim/models/doc2vec_inner.pyx":505 * * if hs: * syn1 = (np.PyArray_DATA(model.trainables.syn1)) # <<<<<<<<<<<<<< * * if negative: */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 422, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 505, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_syn1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 422, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_syn1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 505, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 422, __pyx_L1_error) + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 505, __pyx_L1_error) __pyx_v_syn1 = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_1))); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/models/doc2vec_inner.pyx":421 + /* "gensim/models/doc2vec_inner.pyx":504 * _doctag_locks = (np.PyArray_DATA(doctag_locks)) * * if hs: # <<<<<<<<<<<<<< @@ -5602,7 +5594,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":424 + /* "gensim/models/doc2vec_inner.pyx":507 * syn1 = (np.PyArray_DATA(model.trainables.syn1)) * * if negative: # <<<<<<<<<<<<<< @@ -5612,55 +5604,55 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_5 = (__pyx_v_negative != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":425 + /* 
"gensim/models/doc2vec_inner.pyx":508 * * if negative: * syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) # <<<<<<<<<<<<<< * cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) * cum_table_len = len(model.vocabulary.cum_table) */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 425, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 508, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_syn1neg); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 425, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_syn1neg); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 508, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - if (!(likely(((__pyx_t_3) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_3, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 425, __pyx_L1_error) + if (!(likely(((__pyx_t_3) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_3, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 508, __pyx_L1_error) __pyx_v_syn1neg = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_3))); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/models/doc2vec_inner.pyx":426 + /* "gensim/models/doc2vec_inner.pyx":509 * if negative: * syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) * cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) # <<<<<<<<<<<<<< * cum_table_len = len(model.vocabulary.cum_table) * if negative or sample: */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 426, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 509, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_cum_table); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 426, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_cum_table); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 509, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 426, __pyx_L1_error) + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 509, __pyx_L1_error) __pyx_v_cum_table = ((__pyx_t_5numpy_uint32_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_1))); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/models/doc2vec_inner.pyx":427 + /* "gensim/models/doc2vec_inner.pyx":510 * syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) * cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) * cum_table_len = len(model.vocabulary.cum_table) # <<<<<<<<<<<<<< * if negative or sample: * next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24) */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 427, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 510, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_cum_table); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 427, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, 
__pyx_n_s_cum_table); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 510, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_7 = PyObject_Length(__pyx_t_3); if (unlikely(__pyx_t_7 == ((Py_ssize_t)-1))) __PYX_ERR(0, 427, __pyx_L1_error) + __pyx_t_7 = PyObject_Length(__pyx_t_3); if (unlikely(__pyx_t_7 == ((Py_ssize_t)-1))) __PYX_ERR(0, 510, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_v_cum_table_len = __pyx_t_7; - /* "gensim/models/doc2vec_inner.pyx":424 + /* "gensim/models/doc2vec_inner.pyx":507 * syn1 = (np.PyArray_DATA(model.trainables.syn1)) * * if negative: # <<<<<<<<<<<<<< @@ -5669,7 +5661,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":428 + /* "gensim/models/doc2vec_inner.pyx":511 * cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) * cum_table_len = len(model.vocabulary.cum_table) * if negative or sample: # <<<<<<<<<<<<<< @@ -5687,41 +5679,41 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_L10_bool_binop_done:; if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":429 + /* "gensim/models/doc2vec_inner.pyx":512 * cum_table_len = len(model.vocabulary.cum_table) * if negative or sample: * next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24) # <<<<<<<<<<<<<< * * # convert Python structures to primitive types, so we can release the GIL */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_random); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 429, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_random); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 512, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_randint); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 429, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_randint); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 512, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_tuple__3, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 429, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_tuple__3, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 512, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = PyNumber_Multiply(__pyx_int_16777216, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 429, __pyx_L1_error) + __pyx_t_1 = PyNumber_Multiply(__pyx_int_16777216, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 512, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_random); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 429, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_random); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 512, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_randint); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 429, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_randint); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 512, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_8, __pyx_tuple__4, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 429, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_8, __pyx_tuple__4, NULL); if 
(unlikely(!__pyx_t_3)) __PYX_ERR(0, 512, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; - __pyx_t_8 = PyNumber_Add(__pyx_t_1, __pyx_t_3); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 429, __pyx_L1_error) + __pyx_t_8 = PyNumber_Add(__pyx_t_1, __pyx_t_3); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 512, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_9 = __Pyx_PyInt_As_unsigned_PY_LONG_LONG(__pyx_t_8); if (unlikely((__pyx_t_9 == (unsigned PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 429, __pyx_L1_error) + __pyx_t_9 = __Pyx_PyInt_As_unsigned_PY_LONG_LONG(__pyx_t_8); if (unlikely((__pyx_t_9 == (unsigned PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 512, __pyx_L1_error) __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; __pyx_v_next_random = __pyx_t_9; - /* "gensim/models/doc2vec_inner.pyx":428 + /* "gensim/models/doc2vec_inner.pyx":511 * cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) * cum_table_len = len(model.vocabulary.cum_table) * if negative or sample: # <<<<<<<<<<<<<< @@ -5730,7 +5722,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":432 + /* "gensim/models/doc2vec_inner.pyx":515 * * # convert Python structures to primitive types, so we can release the GIL * if work is None: # <<<<<<<<<<<<<< @@ -5741,32 +5733,32 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = (__pyx_t_5 != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":433 + /* "gensim/models/doc2vec_inner.pyx":516 * # convert Python structures to primitive types, so we can release the GIL * if work is None: * work = zeros(model.trainables.layer1_size, dtype=REAL) # <<<<<<<<<<<<<< * _work = np.PyArray_DATA(work) * if neu1 is None: */ - __pyx_t_8 = __Pyx_GetModuleGlobalName(__pyx_n_s_zeros); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 433, __pyx_L1_error) + __pyx_t_8 = __Pyx_GetModuleGlobalName(__pyx_n_s_zeros); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 516, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 433, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 516, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 433, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 516, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 433, __pyx_L1_error) + __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 516, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 433, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 516, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_10 = __Pyx_GetModuleGlobalName(__pyx_n_s_REAL); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 433, __pyx_L1_error) + __pyx_t_10 = __Pyx_GetModuleGlobalName(__pyx_n_s_REAL); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 516, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); 
- if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_dtype, __pyx_t_10) < 0) __PYX_ERR(0, 433, __pyx_L1_error) + if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_dtype, __pyx_t_10) < 0) __PYX_ERR(0, 516, __pyx_L1_error) __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; - __pyx_t_10 = __Pyx_PyObject_Call(__pyx_t_8, __pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 433, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_Call(__pyx_t_8, __pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 516, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -5774,7 +5766,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __Pyx_DECREF_SET(__pyx_v_work, __pyx_t_10); __pyx_t_10 = 0; - /* "gensim/models/doc2vec_inner.pyx":432 + /* "gensim/models/doc2vec_inner.pyx":515 * * # convert Python structures to primitive types, so we can release the GIL * if work is None: # <<<<<<<<<<<<<< @@ -5783,17 +5775,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":434 + /* "gensim/models/doc2vec_inner.pyx":517 * if work is None: * work = zeros(model.trainables.layer1_size, dtype=REAL) * _work = np.PyArray_DATA(work) # <<<<<<<<<<<<<< * if neu1 is None: * neu1 = zeros(model.trainables.layer1_size, dtype=REAL) */ - if (!(likely(((__pyx_v_work) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_work, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 434, __pyx_L1_error) + if (!(likely(((__pyx_v_work) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_work, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 517, __pyx_L1_error) __pyx_v__work = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_work))); - /* "gensim/models/doc2vec_inner.pyx":435 + /* "gensim/models/doc2vec_inner.pyx":518 * work = zeros(model.trainables.layer1_size, dtype=REAL) * _work = np.PyArray_DATA(work) * if neu1 is None: # <<<<<<<<<<<<<< @@ -5804,32 +5796,32 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_5 = (__pyx_t_6 != 0); if (__pyx_t_5) { - /* "gensim/models/doc2vec_inner.pyx":436 + /* "gensim/models/doc2vec_inner.pyx":519 * _work = np.PyArray_DATA(work) * if neu1 is None: * neu1 = zeros(model.trainables.layer1_size, dtype=REAL) # <<<<<<<<<<<<<< * _neu1 = np.PyArray_DATA(neu1) * */ - __pyx_t_10 = __Pyx_GetModuleGlobalName(__pyx_n_s_zeros); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 436, __pyx_L1_error) + __pyx_t_10 = __Pyx_GetModuleGlobalName(__pyx_n_s_zeros); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 519, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 436, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 519, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 436, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 519, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 436, __pyx_L1_error) + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 519, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_1, 
0, __pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 436, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 519, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_8 = __Pyx_GetModuleGlobalName(__pyx_n_s_REAL); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 436, __pyx_L1_error) + __pyx_t_8 = __Pyx_GetModuleGlobalName(__pyx_n_s_REAL); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 519, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); - if (PyDict_SetItem(__pyx_t_3, __pyx_n_s_dtype, __pyx_t_8) < 0) __PYX_ERR(0, 436, __pyx_L1_error) + if (PyDict_SetItem(__pyx_t_3, __pyx_n_s_dtype, __pyx_t_8) < 0) __PYX_ERR(0, 519, __pyx_L1_error) __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; - __pyx_t_8 = __Pyx_PyObject_Call(__pyx_t_10, __pyx_t_1, __pyx_t_3); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 436, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_t_10, __pyx_t_1, __pyx_t_3); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 519, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; @@ -5837,7 +5829,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __Pyx_DECREF_SET(__pyx_v_neu1, __pyx_t_8); __pyx_t_8 = 0; - /* "gensim/models/doc2vec_inner.pyx":435 + /* "gensim/models/doc2vec_inner.pyx":518 * work = zeros(model.trainables.layer1_size, dtype=REAL) * _work = np.PyArray_DATA(work) * if neu1 is None: # <<<<<<<<<<<<<< @@ -5846,32 +5838,32 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":437 + /* "gensim/models/doc2vec_inner.pyx":520 * if neu1 is None: * neu1 = zeros(model.trainables.layer1_size, dtype=REAL) * _neu1 = np.PyArray_DATA(neu1) # <<<<<<<<<<<<<< * * vlookup = model.wv.vocab */ - if (!(likely(((__pyx_v_neu1) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_neu1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 437, __pyx_L1_error) + if (!(likely(((__pyx_v_neu1) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_neu1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 520, __pyx_L1_error) __pyx_v__neu1 = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_neu1))); - /* "gensim/models/doc2vec_inner.pyx":439 + /* "gensim/models/doc2vec_inner.pyx":522 * _neu1 = np.PyArray_DATA(neu1) * * vlookup = model.wv.vocab # <<<<<<<<<<<<<< * i = 0 * for token in doc_words: */ - __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 439, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 522, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_8, __pyx_n_s_vocab); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 439, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_8, __pyx_n_s_vocab); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 522, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; __pyx_v_vlookup = __pyx_t_3; __pyx_t_3 = 0; - /* "gensim/models/doc2vec_inner.pyx":440 + /* "gensim/models/doc2vec_inner.pyx":523 * * vlookup = model.wv.vocab * i = 0 # <<<<<<<<<<<<<< @@ -5880,7 +5872,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_v_i = 0; - /* "gensim/models/doc2vec_inner.pyx":441 + /* "gensim/models/doc2vec_inner.pyx":524 * vlookup = model.wv.vocab * i = 0 * for token in doc_words: # <<<<<<<<<<<<<< @@ -5891,26 
+5883,26 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_3 = __pyx_v_doc_words; __Pyx_INCREF(__pyx_t_3); __pyx_t_7 = 0; __pyx_t_11 = NULL; } else { - __pyx_t_7 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_v_doc_words); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 441, __pyx_L1_error) + __pyx_t_7 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_v_doc_words); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 524, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_11 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 441, __pyx_L1_error) + __pyx_t_11 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 524, __pyx_L1_error) } for (;;) { if (likely(!__pyx_t_11)) { if (likely(PyList_CheckExact(__pyx_t_3))) { if (__pyx_t_7 >= PyList_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_8 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_8); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 441, __pyx_L1_error) + __pyx_t_8 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_8); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 524, __pyx_L1_error) #else - __pyx_t_8 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 441, __pyx_L1_error) + __pyx_t_8 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 524, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); #endif } else { if (__pyx_t_7 >= PyTuple_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_8 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_8); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 441, __pyx_L1_error) + __pyx_t_8 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_8); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 524, __pyx_L1_error) #else - __pyx_t_8 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 441, __pyx_L1_error) + __pyx_t_8 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 524, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); #endif } @@ -5920,7 +5912,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 441, __pyx_L1_error) + else __PYX_ERR(0, 524, __pyx_L1_error) } break; } @@ -5929,16 +5921,16 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __Pyx_XDECREF_SET(__pyx_v_token, __pyx_t_8); __pyx_t_8 = 0; - /* "gensim/models/doc2vec_inner.pyx":442 + /* "gensim/models/doc2vec_inner.pyx":525 * i = 0 * for token in doc_words: * predict_word = vlookup[token] if token in vlookup else None # <<<<<<<<<<<<<< * if predict_word is None: # shrink document to leave out word * continue # leaving i unchanged */ - __pyx_t_5 = (__Pyx_PySequence_ContainsTF(__pyx_v_token, __pyx_v_vlookup, Py_EQ)); if (unlikely(__pyx_t_5 < 0)) __PYX_ERR(0, 442, __pyx_L1_error) + __pyx_t_5 = (__Pyx_PySequence_ContainsTF(__pyx_v_token, __pyx_v_vlookup, Py_EQ)); if (unlikely(__pyx_t_5 < 0)) __PYX_ERR(0, 525, __pyx_L1_error) if ((__pyx_t_5 != 0)) { - __pyx_t_1 = PyObject_GetItem(__pyx_v_vlookup, __pyx_v_token); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 442, __pyx_L1_error) + __pyx_t_1 = PyObject_GetItem(__pyx_v_vlookup, __pyx_v_token); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 525, __pyx_L1_error) 
__Pyx_GOTREF(__pyx_t_1); __pyx_t_8 = __pyx_t_1; __pyx_t_1 = 0; @@ -5949,7 +5941,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __Pyx_XDECREF_SET(__pyx_v_predict_word, __pyx_t_8); __pyx_t_8 = 0; - /* "gensim/models/doc2vec_inner.pyx":443 + /* "gensim/models/doc2vec_inner.pyx":526 * for token in doc_words: * predict_word = vlookup[token] if token in vlookup else None * if predict_word is None: # shrink document to leave out word # <<<<<<<<<<<<<< @@ -5960,7 +5952,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = (__pyx_t_5 != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":444 + /* "gensim/models/doc2vec_inner.pyx":527 * predict_word = vlookup[token] if token in vlookup else None * if predict_word is None: # shrink document to leave out word * continue # leaving i unchanged # <<<<<<<<<<<<<< @@ -5969,7 +5961,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ goto __pyx_L14_continue; - /* "gensim/models/doc2vec_inner.pyx":443 + /* "gensim/models/doc2vec_inner.pyx":526 * for token in doc_words: * predict_word = vlookup[token] if token in vlookup else None * if predict_word is None: # shrink document to leave out word # <<<<<<<<<<<<<< @@ -5978,7 +5970,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":445 + /* "gensim/models/doc2vec_inner.pyx":528 * if predict_word is None: # shrink document to leave out word * continue # leaving i unchanged * if sample and predict_word.sample_int < random_int32(&next_random): # <<<<<<<<<<<<<< @@ -5991,20 +5983,20 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = __pyx_t_5; goto __pyx_L18_bool_binop_done; } - __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_sample_int); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 445, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_sample_int); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 528, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); - __pyx_t_1 = __Pyx_PyInt_From_unsigned_PY_LONG_LONG(__pyx_f_6gensim_6models_14word2vec_inner_random_int32((&__pyx_v_next_random))); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 445, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_From_unsigned_PY_LONG_LONG(__pyx_f_6gensim_6models_14word2vec_inner_random_int32((&__pyx_v_next_random))); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 528, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_10 = PyObject_RichCompare(__pyx_t_8, __pyx_t_1, Py_LT); __Pyx_XGOTREF(__pyx_t_10); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 445, __pyx_L1_error) + __pyx_t_10 = PyObject_RichCompare(__pyx_t_8, __pyx_t_1, Py_LT); __Pyx_XGOTREF(__pyx_t_10); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 528, __pyx_L1_error) __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_5 = __Pyx_PyObject_IsTrue(__pyx_t_10); if (unlikely(__pyx_t_5 < 0)) __PYX_ERR(0, 445, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_IsTrue(__pyx_t_10); if (unlikely(__pyx_t_5 < 0)) __PYX_ERR(0, 528, __pyx_L1_error) __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; __pyx_t_6 = __pyx_t_5; __pyx_L18_bool_binop_done:; if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":446 + /* "gensim/models/doc2vec_inner.pyx":529 * continue # leaving i unchanged * if sample and predict_word.sample_int < random_int32(&next_random): * continue # <<<<<<<<<<<<<< @@ -6013,7 +6005,7 @@ static PyObject 
*__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ goto __pyx_L14_continue; - /* "gensim/models/doc2vec_inner.pyx":445 + /* "gensim/models/doc2vec_inner.pyx":528 * if predict_word is None: # shrink document to leave out word * continue # leaving i unchanged * if sample and predict_word.sample_int < random_int32(&next_random): # <<<<<<<<<<<<<< @@ -6022,20 +6014,20 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":447 + /* "gensim/models/doc2vec_inner.pyx":530 * if sample and predict_word.sample_int < random_int32(&next_random): * continue * indexes[i] = predict_word.index # <<<<<<<<<<<<<< * if hs: * codelens[i] = len(predict_word.code) */ - __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_index); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 447, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_index); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 530, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); - __pyx_t_12 = __Pyx_PyInt_As_npy_uint32(__pyx_t_10); if (unlikely((__pyx_t_12 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 447, __pyx_L1_error) + __pyx_t_12 = __Pyx_PyInt_As_npy_uint32(__pyx_t_10); if (unlikely((__pyx_t_12 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 530, __pyx_L1_error) __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; (__pyx_v_indexes[__pyx_v_i]) = __pyx_t_12; - /* "gensim/models/doc2vec_inner.pyx":448 + /* "gensim/models/doc2vec_inner.pyx":531 * continue * indexes[i] = predict_word.index * if hs: # <<<<<<<<<<<<<< @@ -6045,46 +6037,46 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = (__pyx_v_hs != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":449 + /* "gensim/models/doc2vec_inner.pyx":532 * indexes[i] = predict_word.index * if hs: * codelens[i] = len(predict_word.code) # <<<<<<<<<<<<<< * codes[i] = np.PyArray_DATA(predict_word.code) * points[i] = np.PyArray_DATA(predict_word.point) */ - __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_code); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 449, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_code); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 532, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); - __pyx_t_13 = PyObject_Length(__pyx_t_10); if (unlikely(__pyx_t_13 == ((Py_ssize_t)-1))) __PYX_ERR(0, 449, __pyx_L1_error) + __pyx_t_13 = PyObject_Length(__pyx_t_10); if (unlikely(__pyx_t_13 == ((Py_ssize_t)-1))) __PYX_ERR(0, 532, __pyx_L1_error) __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; (__pyx_v_codelens[__pyx_v_i]) = ((int)__pyx_t_13); - /* "gensim/models/doc2vec_inner.pyx":450 + /* "gensim/models/doc2vec_inner.pyx":533 * if hs: * codelens[i] = len(predict_word.code) * codes[i] = np.PyArray_DATA(predict_word.code) # <<<<<<<<<<<<<< * points[i] = np.PyArray_DATA(predict_word.point) * result += 1 */ - __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_code); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 450, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_code); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 533, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); - if (!(likely(((__pyx_t_10) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_10, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 450, __pyx_L1_error) + if (!(likely(((__pyx_t_10) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_10, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 533, __pyx_L1_error) 
(__pyx_v_codes[__pyx_v_i]) = ((__pyx_t_5numpy_uint8_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_10))); __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; - /* "gensim/models/doc2vec_inner.pyx":451 + /* "gensim/models/doc2vec_inner.pyx":534 * codelens[i] = len(predict_word.code) * codes[i] = np.PyArray_DATA(predict_word.code) * points[i] = np.PyArray_DATA(predict_word.point) # <<<<<<<<<<<<<< * result += 1 * i += 1 */ - __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_point); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 451, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_point); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 534, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); - if (!(likely(((__pyx_t_10) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_10, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 451, __pyx_L1_error) + if (!(likely(((__pyx_t_10) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_10, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 534, __pyx_L1_error) (__pyx_v_points[__pyx_v_i]) = ((__pyx_t_5numpy_uint32_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_10))); __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; - /* "gensim/models/doc2vec_inner.pyx":448 + /* "gensim/models/doc2vec_inner.pyx":531 * continue * indexes[i] = predict_word.index * if hs: # <<<<<<<<<<<<<< @@ -6093,7 +6085,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":452 + /* "gensim/models/doc2vec_inner.pyx":535 * codes[i] = np.PyArray_DATA(predict_word.code) * points[i] = np.PyArray_DATA(predict_word.point) * result += 1 # <<<<<<<<<<<<<< @@ -6102,7 +6094,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_v_result = (__pyx_v_result + 1); - /* "gensim/models/doc2vec_inner.pyx":453 + /* "gensim/models/doc2vec_inner.pyx":536 * points[i] = np.PyArray_DATA(predict_word.point) * result += 1 * i += 1 # <<<<<<<<<<<<<< @@ -6111,7 +6103,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_v_i = (__pyx_v_i + 1); - /* "gensim/models/doc2vec_inner.pyx":454 + /* "gensim/models/doc2vec_inner.pyx":537 * result += 1 * i += 1 * if i == MAX_DOCUMENT_LEN: # <<<<<<<<<<<<<< @@ -6121,7 +6113,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = ((__pyx_v_i == 0x2710) != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":455 + /* "gensim/models/doc2vec_inner.pyx":538 * i += 1 * if i == MAX_DOCUMENT_LEN: * break # TODO: log warning, tally overflow? # <<<<<<<<<<<<<< @@ -6130,7 +6122,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ goto __pyx_L15_break; - /* "gensim/models/doc2vec_inner.pyx":454 + /* "gensim/models/doc2vec_inner.pyx":537 * result += 1 * i += 1 * if i == MAX_DOCUMENT_LEN: # <<<<<<<<<<<<<< @@ -6139,7 +6131,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":441 + /* "gensim/models/doc2vec_inner.pyx":524 * vlookup = model.wv.vocab * i = 0 * for token in doc_words: # <<<<<<<<<<<<<< @@ -6151,7 +6143,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_L15_break:; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/models/doc2vec_inner.pyx":456 + /* "gensim/models/doc2vec_inner.pyx":539 * if i == MAX_DOCUMENT_LEN: * break # TODO: log warning, tally overflow? 
* document_len = i # <<<<<<<<<<<<<< @@ -6160,7 +6152,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_v_document_len = __pyx_v_i; - /* "gensim/models/doc2vec_inner.pyx":459 + /* "gensim/models/doc2vec_inner.pyx":542 * * # single randint() call avoids a big thread-sync slowdown * for i, item in enumerate(model.random.randint(0, window, document_len)): # <<<<<<<<<<<<<< @@ -6168,14 +6160,14 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT * */ __pyx_t_2 = 0; - __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_random); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 459, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_random); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 542, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_10, __pyx_n_s_randint); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 459, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_10, __pyx_n_s_randint); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 542, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; - __pyx_t_10 = __Pyx_PyInt_From_int(__pyx_v_window); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 459, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyInt_From_int(__pyx_v_window); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 542, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); - __pyx_t_8 = __Pyx_PyInt_From_int(__pyx_v_document_len); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 459, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyInt_From_int(__pyx_v_document_len); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 542, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); __pyx_t_14 = NULL; __pyx_t_15 = 0; @@ -6192,7 +6184,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_1)) { PyObject *__pyx_temp[4] = {__pyx_t_14, __pyx_int_0, __pyx_t_10, __pyx_t_8}; - __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_15, 3+__pyx_t_15); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 459, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_15, 3+__pyx_t_15); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 542, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_14); __pyx_t_14 = 0; __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; @@ -6202,7 +6194,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_1)) { PyObject *__pyx_temp[4] = {__pyx_t_14, __pyx_int_0, __pyx_t_10, __pyx_t_8}; - __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_15, 3+__pyx_t_15); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 459, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_15, 3+__pyx_t_15); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 542, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_14); __pyx_t_14 = 0; __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; @@ -6210,7 +6202,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT } else #endif { - __pyx_t_16 = PyTuple_New(3+__pyx_t_15); if (unlikely(!__pyx_t_16)) __PYX_ERR(0, 459, __pyx_L1_error) + __pyx_t_16 = PyTuple_New(3+__pyx_t_15); if (unlikely(!__pyx_t_16)) __PYX_ERR(0, 542, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_16); if (__pyx_t_14) { __Pyx_GIVEREF(__pyx_t_14); PyTuple_SET_ITEM(__pyx_t_16, 0, __pyx_t_14); __pyx_t_14 = NULL; @@ -6224,7 +6216,7 @@ static PyObject 
*__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT PyTuple_SET_ITEM(__pyx_t_16, 2+__pyx_t_15, __pyx_t_8); __pyx_t_10 = 0; __pyx_t_8 = 0; - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_16, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 459, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_16, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 542, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_16); __pyx_t_16 = 0; } @@ -6233,9 +6225,9 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_1 = __pyx_t_3; __Pyx_INCREF(__pyx_t_1); __pyx_t_7 = 0; __pyx_t_11 = NULL; } else { - __pyx_t_7 = -1; __pyx_t_1 = PyObject_GetIter(__pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 459, __pyx_L1_error) + __pyx_t_7 = -1; __pyx_t_1 = PyObject_GetIter(__pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 542, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_11 = Py_TYPE(__pyx_t_1)->tp_iternext; if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 459, __pyx_L1_error) + __pyx_t_11 = Py_TYPE(__pyx_t_1)->tp_iternext; if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 542, __pyx_L1_error) } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; for (;;) { @@ -6243,17 +6235,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT if (likely(PyList_CheckExact(__pyx_t_1))) { if (__pyx_t_7 >= PyList_GET_SIZE(__pyx_t_1)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_3 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_7); __Pyx_INCREF(__pyx_t_3); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 459, __pyx_L1_error) + __pyx_t_3 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_7); __Pyx_INCREF(__pyx_t_3); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 542, __pyx_L1_error) #else - __pyx_t_3 = PySequence_ITEM(__pyx_t_1, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 459, __pyx_L1_error) + __pyx_t_3 = PySequence_ITEM(__pyx_t_1, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 542, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); #endif } else { if (__pyx_t_7 >= PyTuple_GET_SIZE(__pyx_t_1)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_3 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_7); __Pyx_INCREF(__pyx_t_3); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 459, __pyx_L1_error) + __pyx_t_3 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_7); __Pyx_INCREF(__pyx_t_3); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 542, __pyx_L1_error) #else - __pyx_t_3 = PySequence_ITEM(__pyx_t_1, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 459, __pyx_L1_error) + __pyx_t_3 = PySequence_ITEM(__pyx_t_1, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 542, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); #endif } @@ -6263,7 +6255,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 459, __pyx_L1_error) + else __PYX_ERR(0, 542, __pyx_L1_error) } break; } @@ -6274,17 +6266,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_v_i = __pyx_t_2; __pyx_t_2 = (__pyx_t_2 + 1); - /* "gensim/models/doc2vec_inner.pyx":460 + /* "gensim/models/doc2vec_inner.pyx":543 * # single randint() call avoids a big thread-sync slowdown * for i, item in enumerate(model.random.randint(0, window, document_len)): * reduced_windows[i] = item # <<<<<<<<<<<<<< * * doctag_len = min(MAX_DOCUMENT_LEN, 
len(doctag_indexes)) */ - __pyx_t_12 = __Pyx_PyInt_As_npy_uint32(__pyx_v_item); if (unlikely((__pyx_t_12 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 460, __pyx_L1_error) + __pyx_t_12 = __Pyx_PyInt_As_npy_uint32(__pyx_v_item); if (unlikely((__pyx_t_12 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 543, __pyx_L1_error) (__pyx_v_reduced_windows[__pyx_v_i]) = __pyx_t_12; - /* "gensim/models/doc2vec_inner.pyx":459 + /* "gensim/models/doc2vec_inner.pyx":542 * * # single randint() call avoids a big thread-sync slowdown * for i, item in enumerate(model.random.randint(0, window, document_len)): # <<<<<<<<<<<<<< @@ -6294,14 +6286,14 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/models/doc2vec_inner.pyx":462 + /* "gensim/models/doc2vec_inner.pyx":545 * reduced_windows[i] = item * * doctag_len = min(MAX_DOCUMENT_LEN, len(doctag_indexes)) # <<<<<<<<<<<<<< * for i in range(doctag_len): * _doctag_indexes[i] = doctag_indexes[i] */ - __pyx_t_7 = PyObject_Length(__pyx_v_doctag_indexes); if (unlikely(__pyx_t_7 == ((Py_ssize_t)-1))) __PYX_ERR(0, 462, __pyx_L1_error) + __pyx_t_7 = PyObject_Length(__pyx_v_doctag_indexes); if (unlikely(__pyx_t_7 == ((Py_ssize_t)-1))) __PYX_ERR(0, 545, __pyx_L1_error) __pyx_t_17 = 0x2710; if (((__pyx_t_7 < __pyx_t_17) != 0)) { __pyx_t_13 = __pyx_t_7; @@ -6310,7 +6302,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT } __pyx_v_doctag_len = ((int)__pyx_t_13); - /* "gensim/models/doc2vec_inner.pyx":463 + /* "gensim/models/doc2vec_inner.pyx":546 * * doctag_len = min(MAX_DOCUMENT_LEN, len(doctag_indexes)) * for i in range(doctag_len): # <<<<<<<<<<<<<< @@ -6321,20 +6313,20 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT for (__pyx_t_15 = 0; __pyx_t_15 < __pyx_t_2; __pyx_t_15+=1) { __pyx_v_i = __pyx_t_15; - /* "gensim/models/doc2vec_inner.pyx":464 + /* "gensim/models/doc2vec_inner.pyx":547 * doctag_len = min(MAX_DOCUMENT_LEN, len(doctag_indexes)) * for i in range(doctag_len): * _doctag_indexes[i] = doctag_indexes[i] # <<<<<<<<<<<<<< * result += 1 * */ - __pyx_t_1 = __Pyx_GetItemInt(__pyx_v_doctag_indexes, __pyx_v_i, int, 1, __Pyx_PyInt_From_int, 0, 0, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 464, __pyx_L1_error) + __pyx_t_1 = __Pyx_GetItemInt(__pyx_v_doctag_indexes, __pyx_v_i, int, 1, __Pyx_PyInt_From_int, 0, 0, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 547, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_12 = __Pyx_PyInt_As_npy_uint32(__pyx_t_1); if (unlikely((__pyx_t_12 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 464, __pyx_L1_error) + __pyx_t_12 = __Pyx_PyInt_As_npy_uint32(__pyx_t_1); if (unlikely((__pyx_t_12 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 547, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; (__pyx_v__doctag_indexes[__pyx_v_i]) = __pyx_t_12; - /* "gensim/models/doc2vec_inner.pyx":465 + /* "gensim/models/doc2vec_inner.pyx":548 * for i in range(doctag_len): * _doctag_indexes[i] = doctag_indexes[i] * result += 1 # <<<<<<<<<<<<<< @@ -6344,7 +6336,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_v_result = (__pyx_v_result + 1); } - /* "gensim/models/doc2vec_inner.pyx":468 + /* "gensim/models/doc2vec_inner.pyx":551 * * # release GIL & train on the document * with nogil: # <<<<<<<<<<<<<< @@ -6359,7 +6351,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT #endif /*try:*/ { - /* 
"gensim/models/doc2vec_inner.pyx":469 + /* "gensim/models/doc2vec_inner.pyx":552 * # release GIL & train on the document * with nogil: * for i in range(document_len): # <<<<<<<<<<<<<< @@ -6370,7 +6362,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT for (__pyx_t_15 = 0; __pyx_t_15 < __pyx_t_2; __pyx_t_15+=1) { __pyx_v_i = __pyx_t_15; - /* "gensim/models/doc2vec_inner.pyx":470 + /* "gensim/models/doc2vec_inner.pyx":553 * with nogil: * for i in range(document_len): * j = i - window + reduced_windows[i] # <<<<<<<<<<<<<< @@ -6379,7 +6371,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_v_j = ((__pyx_v_i - __pyx_v_window) + (__pyx_v_reduced_windows[__pyx_v_i])); - /* "gensim/models/doc2vec_inner.pyx":471 + /* "gensim/models/doc2vec_inner.pyx":554 * for i in range(document_len): * j = i - window + reduced_windows[i] * if j < 0: # <<<<<<<<<<<<<< @@ -6389,7 +6381,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = ((__pyx_v_j < 0) != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":472 + /* "gensim/models/doc2vec_inner.pyx":555 * j = i - window + reduced_windows[i] * if j < 0: * j = 0 # <<<<<<<<<<<<<< @@ -6398,7 +6390,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_v_j = 0; - /* "gensim/models/doc2vec_inner.pyx":471 + /* "gensim/models/doc2vec_inner.pyx":554 * for i in range(document_len): * j = i - window + reduced_windows[i] * if j < 0: # <<<<<<<<<<<<<< @@ -6407,7 +6399,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":473 + /* "gensim/models/doc2vec_inner.pyx":556 * if j < 0: * j = 0 * k = i + window + 1 - reduced_windows[i] # <<<<<<<<<<<<<< @@ -6416,7 +6408,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_v_k = (((__pyx_v_i + __pyx_v_window) + 1) - (__pyx_v_reduced_windows[__pyx_v_i])); - /* "gensim/models/doc2vec_inner.pyx":474 + /* "gensim/models/doc2vec_inner.pyx":557 * j = 0 * k = i + window + 1 - reduced_windows[i] * if k > document_len: # <<<<<<<<<<<<<< @@ -6426,7 +6418,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = ((__pyx_v_k > __pyx_v_document_len) != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":475 + /* "gensim/models/doc2vec_inner.pyx":558 * k = i + window + 1 - reduced_windows[i] * if k > document_len: * k = document_len # <<<<<<<<<<<<<< @@ -6435,7 +6427,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_v_k = __pyx_v_document_len; - /* "gensim/models/doc2vec_inner.pyx":474 + /* "gensim/models/doc2vec_inner.pyx":557 * j = 0 * k = i + window + 1 - reduced_windows[i] * if k > document_len: # <<<<<<<<<<<<<< @@ -6444,7 +6436,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":478 + /* "gensim/models/doc2vec_inner.pyx":561 * * # compose l1 (in _neu1) & clear _work * memset(_neu1, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<< @@ -6453,7 +6445,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ memset(__pyx_v__neu1, 0, (__pyx_v_size * (sizeof(__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)))); - /* "gensim/models/doc2vec_inner.pyx":479 + /* "gensim/models/doc2vec_inner.pyx":562 * # compose l1 (in _neu1) & clear _work * memset(_neu1, 0, size * 
cython.sizeof(REAL_t)) * count = 0.0 # <<<<<<<<<<<<<< @@ -6462,7 +6454,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_v_count = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)0.0); - /* "gensim/models/doc2vec_inner.pyx":480 + /* "gensim/models/doc2vec_inner.pyx":563 * memset(_neu1, 0, size * cython.sizeof(REAL_t)) * count = 0.0 * for m in range(j, k): # <<<<<<<<<<<<<< @@ -6473,7 +6465,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT for (__pyx_t_19 = __pyx_v_j; __pyx_t_19 < __pyx_t_18; __pyx_t_19+=1) { __pyx_v_m = __pyx_t_19; - /* "gensim/models/doc2vec_inner.pyx":481 + /* "gensim/models/doc2vec_inner.pyx":564 * count = 0.0 * for m in range(j, k): * if m == i: # <<<<<<<<<<<<<< @@ -6483,7 +6475,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = ((__pyx_v_m == __pyx_v_i) != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":482 + /* "gensim/models/doc2vec_inner.pyx":565 * for m in range(j, k): * if m == i: * continue # <<<<<<<<<<<<<< @@ -6492,7 +6484,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ goto __pyx_L33_continue; - /* "gensim/models/doc2vec_inner.pyx":481 + /* "gensim/models/doc2vec_inner.pyx":564 * count = 0.0 * for m in range(j, k): * if m == i: # <<<<<<<<<<<<<< @@ -6501,7 +6493,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":484 + /* "gensim/models/doc2vec_inner.pyx":567 * continue * else: * count += ONEF # <<<<<<<<<<<<<< @@ -6511,7 +6503,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT /*else*/ { __pyx_v_count = (__pyx_v_count + __pyx_v_6gensim_6models_13doc2vec_inner_ONEF); - /* "gensim/models/doc2vec_inner.pyx":485 + /* "gensim/models/doc2vec_inner.pyx":568 * else: * count += ONEF * our_saxpy(&size, &ONEF, &_word_vectors[indexes[m] * size], &ONE, _neu1, &ONE) # <<<<<<<<<<<<<< @@ -6523,7 +6515,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_L33_continue:; } - /* "gensim/models/doc2vec_inner.pyx":486 + /* "gensim/models/doc2vec_inner.pyx":569 * count += ONEF * our_saxpy(&size, &ONEF, &_word_vectors[indexes[m] * size], &ONE, _neu1, &ONE) * for m in range(doctag_len): # <<<<<<<<<<<<<< @@ -6534,7 +6526,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT for (__pyx_t_19 = 0; __pyx_t_19 < __pyx_t_18; __pyx_t_19+=1) { __pyx_v_m = __pyx_t_19; - /* "gensim/models/doc2vec_inner.pyx":487 + /* "gensim/models/doc2vec_inner.pyx":570 * our_saxpy(&size, &ONEF, &_word_vectors[indexes[m] * size], &ONE, _neu1, &ONE) * for m in range(doctag_len): * count += ONEF # <<<<<<<<<<<<<< @@ -6543,7 +6535,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_v_count = (__pyx_v_count + __pyx_v_6gensim_6models_13doc2vec_inner_ONEF); - /* "gensim/models/doc2vec_inner.pyx":488 + /* "gensim/models/doc2vec_inner.pyx":571 * for m in range(doctag_len): * count += ONEF * our_saxpy(&size, &ONEF, &_doctag_vectors[_doctag_indexes[m] * size], &ONE, _neu1, &ONE) # <<<<<<<<<<<<<< @@ -6553,7 +6545,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONEF), (&(__pyx_v__doctag_vectors[((__pyx_v__doctag_indexes[__pyx_v_m]) * __pyx_v_size)])), 
(&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), __pyx_v__neu1, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); } - /* "gensim/models/doc2vec_inner.pyx":489 + /* "gensim/models/doc2vec_inner.pyx":572 * count += ONEF * our_saxpy(&size, &ONEF, &_doctag_vectors[_doctag_indexes[m] * size], &ONE, _neu1, &ONE) * if count > (0.5): # <<<<<<<<<<<<<< @@ -6563,7 +6555,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = ((__pyx_v_count > ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)0.5)) != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":490 + /* "gensim/models/doc2vec_inner.pyx":573 * our_saxpy(&size, &ONEF, &_doctag_vectors[_doctag_indexes[m] * size], &ONE, _neu1, &ONE) * if count > (0.5): * inv_count = ONEF/count # <<<<<<<<<<<<<< @@ -6572,7 +6564,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_v_inv_count = (__pyx_v_6gensim_6models_13doc2vec_inner_ONEF / __pyx_v_count); - /* "gensim/models/doc2vec_inner.pyx":489 + /* "gensim/models/doc2vec_inner.pyx":572 * count += ONEF * our_saxpy(&size, &ONEF, &_doctag_vectors[_doctag_indexes[m] * size], &ONE, _neu1, &ONE) * if count > (0.5): # <<<<<<<<<<<<<< @@ -6581,7 +6573,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":491 + /* "gensim/models/doc2vec_inner.pyx":574 * if count > (0.5): * inv_count = ONEF/count * if cbow_mean: # <<<<<<<<<<<<<< @@ -6591,7 +6583,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = (__pyx_v_cbow_mean != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":492 + /* "gensim/models/doc2vec_inner.pyx":575 * inv_count = ONEF/count * if cbow_mean: * sscal(&size, &inv_count, _neu1, &ONE) # (does this need BLAS-variants like saxpy?) # <<<<<<<<<<<<<< @@ -6600,7 +6592,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_v_6gensim_6models_14word2vec_inner_sscal((&__pyx_v_size), (&__pyx_v_inv_count), __pyx_v__neu1, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":491 + /* "gensim/models/doc2vec_inner.pyx":574 * if count > (0.5): * inv_count = ONEF/count * if cbow_mean: # <<<<<<<<<<<<<< @@ -6609,7 +6601,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":493 + /* "gensim/models/doc2vec_inner.pyx":576 * if cbow_mean: * sscal(&size, &inv_count, _neu1, &ONE) # (does this need BLAS-variants like saxpy?) * memset(_work, 0, size * cython.sizeof(REAL_t)) # work to accumulate l1 error # <<<<<<<<<<<<<< @@ -6618,7 +6610,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ memset(__pyx_v__work, 0, (__pyx_v_size * (sizeof(__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)))); - /* "gensim/models/doc2vec_inner.pyx":494 + /* "gensim/models/doc2vec_inner.pyx":577 * sscal(&size, &inv_count, _neu1, &ONE) # (does this need BLAS-variants like saxpy?) 
* memset(_work, 0, size * cython.sizeof(REAL_t)) # work to accumulate l1 error * if hs: # <<<<<<<<<<<<<< @@ -6628,7 +6620,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = (__pyx_v_hs != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":495 + /* "gensim/models/doc2vec_inner.pyx":578 * memset(_work, 0, size * cython.sizeof(REAL_t)) # work to accumulate l1 error * if hs: * fast_document_dm_hs(points[i], codes[i], codelens[i], # <<<<<<<<<<<<<< @@ -6637,7 +6629,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_hs((__pyx_v_points[__pyx_v_i]), (__pyx_v_codes[__pyx_v_i]), (__pyx_v_codelens[__pyx_v_i]), __pyx_v__neu1, __pyx_v_syn1, __pyx_v__alpha, __pyx_v__work, __pyx_v_size, __pyx_v__learn_hidden); - /* "gensim/models/doc2vec_inner.pyx":494 + /* "gensim/models/doc2vec_inner.pyx":577 * sscal(&size, &inv_count, _neu1, &ONE) # (does this need BLAS-variants like saxpy?) * memset(_work, 0, size * cython.sizeof(REAL_t)) # work to accumulate l1 error * if hs: # <<<<<<<<<<<<<< @@ -6646,7 +6638,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":498 + /* "gensim/models/doc2vec_inner.pyx":581 * _neu1, syn1, _alpha, _work, * size, _learn_hidden) * if negative: # <<<<<<<<<<<<<< @@ -6656,7 +6648,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = (__pyx_v_negative != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":499 + /* "gensim/models/doc2vec_inner.pyx":582 * size, _learn_hidden) * if negative: * next_random = fast_document_dm_neg(negative, cum_table, cum_table_len, next_random, # <<<<<<<<<<<<<< @@ -6665,7 +6657,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_v_next_random = __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dm_neg(__pyx_v_negative, __pyx_v_cum_table, __pyx_v_cum_table_len, __pyx_v_next_random, __pyx_v__neu1, __pyx_v_syn1neg, (__pyx_v_indexes[__pyx_v_i]), __pyx_v__alpha, __pyx_v__work, __pyx_v_size, __pyx_v__learn_hidden); - /* "gensim/models/doc2vec_inner.pyx":498 + /* "gensim/models/doc2vec_inner.pyx":581 * _neu1, syn1, _alpha, _work, * size, _learn_hidden) * if negative: # <<<<<<<<<<<<<< @@ -6674,7 +6666,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":503 + /* "gensim/models/doc2vec_inner.pyx":586 * size, _learn_hidden) * * if not cbow_mean: # <<<<<<<<<<<<<< @@ -6684,7 +6676,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = ((!(__pyx_v_cbow_mean != 0)) != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":504 + /* "gensim/models/doc2vec_inner.pyx":587 * * if not cbow_mean: * sscal(&size, &inv_count, _work, &ONE) # (does this need BLAS-variants like saxpy?) 
# <<<<<<<<<<<<<< @@ -6693,7 +6685,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ __pyx_v_6gensim_6models_14word2vec_inner_sscal((&__pyx_v_size), (&__pyx_v_inv_count), __pyx_v__work, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); - /* "gensim/models/doc2vec_inner.pyx":503 + /* "gensim/models/doc2vec_inner.pyx":586 * size, _learn_hidden) * * if not cbow_mean: # <<<<<<<<<<<<<< @@ -6702,7 +6694,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":506 + /* "gensim/models/doc2vec_inner.pyx":589 * sscal(&size, &inv_count, _work, &ONE) # (does this need BLAS-variants like saxpy?) * # apply accumulated error in work * if _learn_doctags: # <<<<<<<<<<<<<< @@ -6712,7 +6704,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = (__pyx_v__learn_doctags != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":507 + /* "gensim/models/doc2vec_inner.pyx":590 * # apply accumulated error in work * if _learn_doctags: * for m in range(doctag_len): # <<<<<<<<<<<<<< @@ -6723,7 +6715,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT for (__pyx_t_19 = 0; __pyx_t_19 < __pyx_t_18; __pyx_t_19+=1) { __pyx_v_m = __pyx_t_19; - /* "gensim/models/doc2vec_inner.pyx":508 + /* "gensim/models/doc2vec_inner.pyx":591 * if _learn_doctags: * for m in range(doctag_len): * our_saxpy(&size, &_doctag_locks[_doctag_indexes[m]], _work, # <<<<<<<<<<<<<< @@ -6733,7 +6725,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_size), (&(__pyx_v__doctag_locks[(__pyx_v__doctag_indexes[__pyx_v_m])])), __pyx_v__work, (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v__doctag_vectors[((__pyx_v__doctag_indexes[__pyx_v_m]) * __pyx_v_size)])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); } - /* "gensim/models/doc2vec_inner.pyx":506 + /* "gensim/models/doc2vec_inner.pyx":589 * sscal(&size, &inv_count, _work, &ONE) # (does this need BLAS-variants like saxpy?) 
* # apply accumulated error in work * if _learn_doctags: # <<<<<<<<<<<<<< @@ -6742,7 +6734,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":510 + /* "gensim/models/doc2vec_inner.pyx":593 * our_saxpy(&size, &_doctag_locks[_doctag_indexes[m]], _work, * &ONE, &_doctag_vectors[_doctag_indexes[m] * size], &ONE) * if _learn_words: # <<<<<<<<<<<<<< @@ -6752,7 +6744,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = (__pyx_v__learn_words != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":511 + /* "gensim/models/doc2vec_inner.pyx":594 * &ONE, &_doctag_vectors[_doctag_indexes[m] * size], &ONE) * if _learn_words: * for m in range(j, k): # <<<<<<<<<<<<<< @@ -6763,7 +6755,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT for (__pyx_t_19 = __pyx_v_j; __pyx_t_19 < __pyx_t_18; __pyx_t_19+=1) { __pyx_v_m = __pyx_t_19; - /* "gensim/models/doc2vec_inner.pyx":512 + /* "gensim/models/doc2vec_inner.pyx":595 * if _learn_words: * for m in range(j, k): * if m == i: # <<<<<<<<<<<<<< @@ -6773,7 +6765,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_t_6 = ((__pyx_v_m == __pyx_v_i) != 0); if (__pyx_t_6) { - /* "gensim/models/doc2vec_inner.pyx":513 + /* "gensim/models/doc2vec_inner.pyx":596 * for m in range(j, k): * if m == i: * continue # <<<<<<<<<<<<<< @@ -6782,7 +6774,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ goto __pyx_L47_continue; - /* "gensim/models/doc2vec_inner.pyx":512 + /* "gensim/models/doc2vec_inner.pyx":595 * if _learn_words: * for m in range(j, k): * if m == i: # <<<<<<<<<<<<<< @@ -6791,7 +6783,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ } - /* "gensim/models/doc2vec_inner.pyx":515 + /* "gensim/models/doc2vec_inner.pyx":598 * continue * else: * our_saxpy(&size, &_word_locks[indexes[m]], _work, &ONE, # <<<<<<<<<<<<<< @@ -6800,7 +6792,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT */ /*else*/ { - /* "gensim/models/doc2vec_inner.pyx":516 + /* "gensim/models/doc2vec_inner.pyx":599 * else: * our_saxpy(&size, &_word_locks[indexes[m]], _work, &ONE, * &_word_vectors[indexes[m] * size], &ONE) # <<<<<<<<<<<<<< @@ -6812,7 +6804,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT __pyx_L47_continue:; } - /* "gensim/models/doc2vec_inner.pyx":510 + /* "gensim/models/doc2vec_inner.pyx":593 * our_saxpy(&size, &_doctag_locks[_doctag_indexes[m]], _work, * &ONE, &_doctag_vectors[_doctag_indexes[m] * size], &ONE) * if _learn_words: # <<<<<<<<<<<<<< @@ -6823,7 +6815,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT } } - /* "gensim/models/doc2vec_inner.pyx":468 + /* "gensim/models/doc2vec_inner.pyx":551 * * # release GIL & train on the document * with nogil: # <<<<<<<<<<<<<< @@ -6842,7 +6834,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT } } - /* "gensim/models/doc2vec_inner.pyx":518 + /* "gensim/models/doc2vec_inner.pyx":601 * &_word_vectors[indexes[m] * size], &ONE) * * return result # <<<<<<<<<<<<<< @@ -6850,13 +6842,13 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT * */ __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyInt_From_long(__pyx_v_result); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 518, __pyx_L1_error) + __pyx_t_1 = 
__Pyx_PyInt_From_long(__pyx_v_result); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 601, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; goto __pyx_L0; - /* "gensim/models/doc2vec_inner.pyx":363 + /* "gensim/models/doc2vec_inner.pyx":401 * * * def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, # <<<<<<<<<<<<<< @@ -6890,7 +6882,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT return __pyx_r; } -/* "gensim/models/doc2vec_inner.pyx":521 +/* "gensim/models/doc2vec_inner.pyx":604 * * * def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, # <<<<<<<<<<<<<< @@ -6900,7 +6892,8 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_2train_document_dm(CYT /* Python wrapper */ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_5train_document_dm_concat(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ -static PyMethodDef __pyx_mdef_6gensim_6models_13doc2vec_inner_5train_document_dm_concat = {"train_document_dm_concat", (PyCFunction)__pyx_pw_6gensim_6models_13doc2vec_inner_5train_document_dm_concat, METH_VARARGS|METH_KEYWORDS, 0}; +static char __pyx_doc_6gensim_6models_13doc2vec_inner_4train_document_dm_concat[] = "train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, learn_doctags=True, learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None)\nUpdate distributed memory model (\"PV-DM\") by training on a single document, using a concatenation of the context\n window word vectors (rather than a sum or average).\n This might be slower since the input at each batch will be significantly larger.\n\n Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and\n :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector`.\n\n Parameters\n ----------\n model : :class:`~gensim.models.doc2vec.Doc2Vec`\n The model to train.\n doc_words : list of str\n The input document as a list of words to be used for training. Each word will be looked up in\n the model's vocabulary.\n doctag_indexes : list of int\n Indices into `doctag_vectors` used to obtain the tags of the document.\n alpha : float, optional\n Learning rate.\n work : np.ndarray, optional\n Private working memory for each worker.\n neu1 : np.ndarray, optional\n Private working memory for each worker.\n learn_doctags : bool, optional\n Whether the tag vectors should be updated.\n learn_words : bool, optional\n Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both**\n `learn_words` and `train_words` are set to True.\n learn_hidden : bool, optional\n Whether or not the weights of the hidden layer will be updated.\n word_vectors : numpy.ndarray, optional\n The vector representation for each word in the vocabulary. If None, these will be retrieved from the model.\n word_locks : numpy.ndarray, optional\n A learning lock factor for each weight in the hidden layer for words, value 0 completely blocks updates,\n a value of 1 allows to update word-vectors.\n doctag_v""ectors : numpy.ndarray, optional\n Vector representations of the tags. 
If None, these will be retrieved from the model.\n doctag_locks : numpy.ndarray, optional\n The lock factors for each tag, same as `word_locks`, but for document-vectors.\n\n Returns\n -------\n int\n Number of words in the input document that were actually used for training.\n\n "; +static PyMethodDef __pyx_mdef_6gensim_6models_13doc2vec_inner_5train_document_dm_concat = {"train_document_dm_concat", (PyCFunction)__pyx_pw_6gensim_6models_13doc2vec_inner_5train_document_dm_concat, METH_VARARGS|METH_KEYWORDS, __pyx_doc_6gensim_6models_13doc2vec_inner_4train_document_dm_concat}; static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_5train_document_dm_concat(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject *__pyx_v_model = 0; PyObject *__pyx_v_doc_words = 0; @@ -6924,23 +6917,23 @@ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_5train_document_dm_con values[4] = ((PyObject *)Py_None); values[5] = ((PyObject *)Py_None); - /* "gensim/models/doc2vec_inner.pyx":522 + /* "gensim/models/doc2vec_inner.pyx":605 * * def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, * learn_doctags=True, learn_words=True, learn_hidden=True, # <<<<<<<<<<<<<< * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): - * cdef int hs = model.hs + * """Update distributed memory model ("PV-DM") by training on a single document, using a concatenation of the context */ values[6] = ((PyObject *)Py_True); values[7] = ((PyObject *)Py_True); values[8] = ((PyObject *)Py_True); - /* "gensim/models/doc2vec_inner.pyx":523 + /* "gensim/models/doc2vec_inner.pyx":606 * def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, * learn_doctags=True, learn_words=True, learn_hidden=True, * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): # <<<<<<<<<<<<<< - * cdef int hs = model.hs - * cdef int negative = model.negative + * """Update distributed memory model ("PV-DM") by training on a single document, using a concatenation of the context + * window word vectors (rather than a sum or average). 
*/ values[9] = ((PyObject *)Py_None); values[10] = ((PyObject *)Py_None); @@ -6988,19 +6981,19 @@ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_5train_document_dm_con case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_doc_words)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train_document_dm_concat", 0, 4, 13, 1); __PYX_ERR(0, 521, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("train_document_dm_concat", 0, 4, 13, 1); __PYX_ERR(0, 604, __pyx_L3_error) } CYTHON_FALLTHROUGH; case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_doctag_indexes)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train_document_dm_concat", 0, 4, 13, 2); __PYX_ERR(0, 521, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("train_document_dm_concat", 0, 4, 13, 2); __PYX_ERR(0, 604, __pyx_L3_error) } CYTHON_FALLTHROUGH; case 3: if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_alpha)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train_document_dm_concat", 0, 4, 13, 3); __PYX_ERR(0, 521, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("train_document_dm_concat", 0, 4, 13, 3); __PYX_ERR(0, 604, __pyx_L3_error) } CYTHON_FALLTHROUGH; case 4: @@ -7058,7 +7051,7 @@ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_5train_document_dm_con } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train_document_dm_concat") < 0)) __PYX_ERR(0, 521, __pyx_L3_error) + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train_document_dm_concat") < 0)) __PYX_ERR(0, 604, __pyx_L3_error) } } else { switch (PyTuple_GET_SIZE(__pyx_args)) { @@ -7104,7 +7097,7 @@ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_5train_document_dm_con } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("train_document_dm_concat", 0, 4, 13, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 521, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("train_document_dm_concat", 0, 4, 13, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 604, __pyx_L3_error) __pyx_L3_error:; __Pyx_AddTraceback("gensim.models.doc2vec_inner.train_document_dm_concat", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); @@ -7112,7 +7105,7 @@ static PyObject *__pyx_pw_6gensim_6models_13doc2vec_inner_5train_document_dm_con __pyx_L4_argument_unpacking_done:; __pyx_r = __pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_concat(__pyx_self, __pyx_v_model, __pyx_v_doc_words, __pyx_v_doctag_indexes, __pyx_v_alpha, __pyx_v_work, __pyx_v_neu1, __pyx_v_learn_doctags, __pyx_v_learn_words, __pyx_v_learn_hidden, __pyx_v_word_vectors, __pyx_v_word_locks, __pyx_v_doctag_vectors, __pyx_v_doctag_locks); - /* "gensim/models/doc2vec_inner.pyx":521 + /* "gensim/models/doc2vec_inner.pyx":604 * * * def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, # <<<<<<<<<<<<<< @@ -7193,149 +7186,149 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __Pyx_INCREF(__pyx_v_doctag_vectors); __Pyx_INCREF(__pyx_v_doctag_locks); - /* "gensim/models/doc2vec_inner.pyx":524 - * learn_doctags=True, learn_words=True, learn_hidden=True, - * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): + /* "gensim/models/doc2vec_inner.pyx":652 + * + * """ * cdef int hs = model.hs # <<<<<<<<<<<<<< * cdef int negative = model.negative * cdef int sample = (model.vocabulary.sample != 0) */ 
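/* Editor's note (not generated code; hedged): unlike the plain PV-DM routine
 * above, the "concat" variant keeps `layer1_size` and `vector_size` as separate
 * C locals (read just below), because the hidden-layer input is the concatenation
 * of the tag vectors with the 2*window context word vectors rather than their
 * average. Assuming gensim's usual dm_concat layout:
 *
 *     layer1_size == (dm_tag_count + 2 * window) * vector_size
 *
 * Context positions that fall outside the document are padded with the null word
 * ('\0', whose vocabulary index is looked up further down in this hunk), so every
 * training example presents a fixed-width input to the hidden layer. */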
- __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_hs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 524, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_hs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 652, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 524, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 652, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_hs = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":525 - * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): + /* "gensim/models/doc2vec_inner.pyx":653 + * """ * cdef int hs = model.hs * cdef int negative = model.negative # <<<<<<<<<<<<<< * cdef int sample = (model.vocabulary.sample != 0) * cdef int _learn_doctags = learn_doctags */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_negative); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 525, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_negative); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 653, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 525, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 653, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_negative = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":526 + /* "gensim/models/doc2vec_inner.pyx":654 * cdef int hs = model.hs * cdef int negative = model.negative * cdef int sample = (model.vocabulary.sample != 0) # <<<<<<<<<<<<<< * cdef int _learn_doctags = learn_doctags * cdef int _learn_words = learn_words */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 526, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 654, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_sample); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 526, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_sample); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 654, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = PyObject_RichCompare(__pyx_t_3, __pyx_int_0, Py_NE); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 526, __pyx_L1_error) + __pyx_t_1 = PyObject_RichCompare(__pyx_t_3, __pyx_int_0, Py_NE); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 654, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 526, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 654, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_sample = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":527 + /* "gensim/models/doc2vec_inner.pyx":655 * cdef int negative = model.negative * cdef int sample = (model.vocabulary.sample != 0) * cdef int _learn_doctags = learn_doctags # <<<<<<<<<<<<<< * cdef int _learn_words = learn_words * cdef int _learn_hidden = learn_hidden */ - __pyx_t_2 = 
__Pyx_PyInt_As_int(__pyx_v_learn_doctags); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 527, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_doctags); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 655, __pyx_L1_error) __pyx_v__learn_doctags = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":528 + /* "gensim/models/doc2vec_inner.pyx":656 * cdef int sample = (model.vocabulary.sample != 0) * cdef int _learn_doctags = learn_doctags * cdef int _learn_words = learn_words # <<<<<<<<<<<<<< * cdef int _learn_hidden = learn_hidden * */ - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_words); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 528, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_words); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 656, __pyx_L1_error) __pyx_v__learn_words = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":529 + /* "gensim/models/doc2vec_inner.pyx":657 * cdef int _learn_doctags = learn_doctags * cdef int _learn_words = learn_words * cdef int _learn_hidden = learn_hidden # <<<<<<<<<<<<<< * * cdef REAL_t *_word_vectors */ - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_hidden); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 529, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_hidden); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 657, __pyx_L1_error) __pyx_v__learn_hidden = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":537 + /* "gensim/models/doc2vec_inner.pyx":665 * cdef REAL_t *_work * cdef REAL_t *_neu1 * cdef REAL_t _alpha = alpha # <<<<<<<<<<<<<< * cdef int layer1_size = model.trainables.layer1_size * cdef int vector_size = model.docvecs.vector_size */ - __pyx_t_4 = __pyx_PyFloat_AsFloat(__pyx_v_alpha); if (unlikely((__pyx_t_4 == ((npy_float32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 537, __pyx_L1_error) + __pyx_t_4 = __pyx_PyFloat_AsFloat(__pyx_v_alpha); if (unlikely((__pyx_t_4 == ((npy_float32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 665, __pyx_L1_error) __pyx_v__alpha = __pyx_t_4; - /* "gensim/models/doc2vec_inner.pyx":538 + /* "gensim/models/doc2vec_inner.pyx":666 * cdef REAL_t *_neu1 * cdef REAL_t _alpha = alpha * cdef int layer1_size = model.trainables.layer1_size # <<<<<<<<<<<<<< * cdef int vector_size = model.docvecs.vector_size * */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 538, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 666, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 538, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 666, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 538, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 666, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_v_layer1_size = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":539 + /* "gensim/models/doc2vec_inner.pyx":667 * cdef REAL_t _alpha = alpha * cdef int layer1_size = model.trainables.layer1_size * cdef 
int vector_size = model.docvecs.vector_size # <<<<<<<<<<<<<< * * cdef int codelens[MAX_DOCUMENT_LEN] */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_docvecs); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 539, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_docvecs); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 667, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_vector_size); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 539, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_vector_size); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 667, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 539, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 667, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_vector_size = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":547 + /* "gensim/models/doc2vec_inner.pyx":675 * cdef int document_len * cdef int doctag_len * cdef int window = model.window # <<<<<<<<<<<<<< * cdef int expected_doctag_len = model.dm_tag_count * */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_window); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 547, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_window); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 675, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 547, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 675, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_window = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":548 + /* "gensim/models/doc2vec_inner.pyx":676 * cdef int doctag_len * cdef int window = model.window * cdef int expected_doctag_len = model.dm_tag_count # <<<<<<<<<<<<<< * * cdef int i, j, k, m, n */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_dm_tag_count); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 548, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_dm_tag_count); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 676, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 548, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 676, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_expected_doctag_len = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":551 + /* "gensim/models/doc2vec_inner.pyx":679 * * cdef int i, j, k, m, n * cdef long result = 0 # <<<<<<<<<<<<<< @@ -7344,36 +7337,36 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ __pyx_v_result = 0; - /* "gensim/models/doc2vec_inner.pyx":552 + /* "gensim/models/doc2vec_inner.pyx":680 * cdef int i, j, k, m, n * cdef long result = 0 * cdef int null_word_index = model.wv.vocab['\0'].index # <<<<<<<<<<<<<< * * # For hierarchical softmax */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 552, __pyx_L1_error) + __pyx_t_1 = 
__Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 680, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_vocab); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 552, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_vocab); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 680, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = PyObject_GetItem(__pyx_t_3, __pyx_kp_s__5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 552, __pyx_L1_error) + __pyx_t_1 = PyObject_GetItem(__pyx_t_3, __pyx_kp_s__5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 680, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_index); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 552, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_index); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 680, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 552, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 680, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_v_null_word_index = __pyx_t_2; - /* "gensim/models/doc2vec_inner.pyx":565 + /* "gensim/models/doc2vec_inner.pyx":693 * cdef unsigned long long next_random * * doctag_len = min(MAX_DOCUMENT_LEN, len(doctag_indexes)) # <<<<<<<<<<<<<< * if doctag_len != expected_doctag_len: * return 0 # skip doc without expected number of tags */ - __pyx_t_5 = PyObject_Length(__pyx_v_doctag_indexes); if (unlikely(__pyx_t_5 == ((Py_ssize_t)-1))) __PYX_ERR(0, 565, __pyx_L1_error) + __pyx_t_5 = PyObject_Length(__pyx_v_doctag_indexes); if (unlikely(__pyx_t_5 == ((Py_ssize_t)-1))) __PYX_ERR(0, 693, __pyx_L1_error) __pyx_t_6 = 0x2710; if (((__pyx_t_5 < __pyx_t_6) != 0)) { __pyx_t_7 = __pyx_t_5; @@ -7382,7 +7375,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con } __pyx_v_doctag_len = ((int)__pyx_t_7); - /* "gensim/models/doc2vec_inner.pyx":566 + /* "gensim/models/doc2vec_inner.pyx":694 * * doctag_len = min(MAX_DOCUMENT_LEN, len(doctag_indexes)) * if doctag_len != expected_doctag_len: # <<<<<<<<<<<<<< @@ -7392,7 +7385,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_8 = ((__pyx_v_doctag_len != __pyx_v_expected_doctag_len) != 0); if (__pyx_t_8) { - /* "gensim/models/doc2vec_inner.pyx":567 + /* "gensim/models/doc2vec_inner.pyx":695 * doctag_len = min(MAX_DOCUMENT_LEN, len(doctag_indexes)) * if doctag_len != expected_doctag_len: * return 0 # skip doc without expected number of tags # <<<<<<<<<<<<<< @@ -7404,7 +7397,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_r = __pyx_int_0; goto __pyx_L0; - /* "gensim/models/doc2vec_inner.pyx":566 + /* "gensim/models/doc2vec_inner.pyx":694 * * doctag_len = min(MAX_DOCUMENT_LEN, len(doctag_indexes)) * if doctag_len != expected_doctag_len: # <<<<<<<<<<<<<< @@ -7413,7 +7406,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":570 + /* "gensim/models/doc2vec_inner.pyx":698 * * # default vectors, locks from syn0/doctag_syn0 * if word_vectors is None: # <<<<<<<<<<<<<< @@ -7424,22 +7417,22 @@ static PyObject 
*__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_9 = (__pyx_t_8 != 0); if (__pyx_t_9) { - /* "gensim/models/doc2vec_inner.pyx":571 + /* "gensim/models/doc2vec_inner.pyx":699 * # default vectors, locks from syn0/doctag_syn0 * if word_vectors is None: * word_vectors = model.wv.vectors # <<<<<<<<<<<<<< * _word_vectors = (np.PyArray_DATA(word_vectors)) * if doctag_vectors is None: */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 571, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 699, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_vectors); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 571, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_vectors); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 699, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF_SET(__pyx_v_word_vectors, __pyx_t_1); __pyx_t_1 = 0; - /* "gensim/models/doc2vec_inner.pyx":570 + /* "gensim/models/doc2vec_inner.pyx":698 * * # default vectors, locks from syn0/doctag_syn0 * if word_vectors is None: # <<<<<<<<<<<<<< @@ -7448,17 +7441,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":572 + /* "gensim/models/doc2vec_inner.pyx":700 * if word_vectors is None: * word_vectors = model.wv.vectors * _word_vectors = (np.PyArray_DATA(word_vectors)) # <<<<<<<<<<<<<< * if doctag_vectors is None: * doctag_vectors = model.docvecs.vectors_docs */ - if (!(likely(((__pyx_v_word_vectors) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_word_vectors, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 572, __pyx_L1_error) + if (!(likely(((__pyx_v_word_vectors) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_word_vectors, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 700, __pyx_L1_error) __pyx_v__word_vectors = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_word_vectors))); - /* "gensim/models/doc2vec_inner.pyx":573 + /* "gensim/models/doc2vec_inner.pyx":701 * word_vectors = model.wv.vectors * _word_vectors = (np.PyArray_DATA(word_vectors)) * if doctag_vectors is None: # <<<<<<<<<<<<<< @@ -7469,22 +7462,22 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_8 = (__pyx_t_9 != 0); if (__pyx_t_8) { - /* "gensim/models/doc2vec_inner.pyx":574 + /* "gensim/models/doc2vec_inner.pyx":702 * _word_vectors = (np.PyArray_DATA(word_vectors)) * if doctag_vectors is None: * doctag_vectors = model.docvecs.vectors_docs # <<<<<<<<<<<<<< * _doctag_vectors = (np.PyArray_DATA(doctag_vectors)) * if word_locks is None: */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_docvecs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 574, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_docvecs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 702, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_vectors_docs); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 574, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_vectors_docs); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 702, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF_SET(__pyx_v_doctag_vectors, __pyx_t_3); __pyx_t_3 = 0; - /* "gensim/models/doc2vec_inner.pyx":573 + /* 
"gensim/models/doc2vec_inner.pyx":701 * word_vectors = model.wv.vectors * _word_vectors = (np.PyArray_DATA(word_vectors)) * if doctag_vectors is None: # <<<<<<<<<<<<<< @@ -7493,17 +7486,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":575 + /* "gensim/models/doc2vec_inner.pyx":703 * if doctag_vectors is None: * doctag_vectors = model.docvecs.vectors_docs * _doctag_vectors = (np.PyArray_DATA(doctag_vectors)) # <<<<<<<<<<<<<< * if word_locks is None: * word_locks = model.trainables.vectors_lockf */ - if (!(likely(((__pyx_v_doctag_vectors) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_doctag_vectors, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 575, __pyx_L1_error) + if (!(likely(((__pyx_v_doctag_vectors) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_doctag_vectors, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 703, __pyx_L1_error) __pyx_v__doctag_vectors = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_doctag_vectors))); - /* "gensim/models/doc2vec_inner.pyx":576 + /* "gensim/models/doc2vec_inner.pyx":704 * doctag_vectors = model.docvecs.vectors_docs * _doctag_vectors = (np.PyArray_DATA(doctag_vectors)) * if word_locks is None: # <<<<<<<<<<<<<< @@ -7514,22 +7507,22 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_9 = (__pyx_t_8 != 0); if (__pyx_t_9) { - /* "gensim/models/doc2vec_inner.pyx":577 + /* "gensim/models/doc2vec_inner.pyx":705 * _doctag_vectors = (np.PyArray_DATA(doctag_vectors)) * if word_locks is None: * word_locks = model.trainables.vectors_lockf # <<<<<<<<<<<<<< * _word_locks = (np.PyArray_DATA(word_locks)) * if doctag_locks is None: */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 577, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 705, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_vectors_lockf); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 577, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_vectors_lockf); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 705, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF_SET(__pyx_v_word_locks, __pyx_t_1); __pyx_t_1 = 0; - /* "gensim/models/doc2vec_inner.pyx":576 + /* "gensim/models/doc2vec_inner.pyx":704 * doctag_vectors = model.docvecs.vectors_docs * _doctag_vectors = (np.PyArray_DATA(doctag_vectors)) * if word_locks is None: # <<<<<<<<<<<<<< @@ -7538,17 +7531,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":578 + /* "gensim/models/doc2vec_inner.pyx":706 * if word_locks is None: * word_locks = model.trainables.vectors_lockf * _word_locks = (np.PyArray_DATA(word_locks)) # <<<<<<<<<<<<<< * if doctag_locks is None: * doctag_locks = model.trainables.vectors_docs_lockf */ - if (!(likely(((__pyx_v_word_locks) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_word_locks, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 578, __pyx_L1_error) + if (!(likely(((__pyx_v_word_locks) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_word_locks, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 706, __pyx_L1_error) __pyx_v__word_locks = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_word_locks))); - /* 
"gensim/models/doc2vec_inner.pyx":579 + /* "gensim/models/doc2vec_inner.pyx":707 * word_locks = model.trainables.vectors_lockf * _word_locks = (np.PyArray_DATA(word_locks)) * if doctag_locks is None: # <<<<<<<<<<<<<< @@ -7559,22 +7552,22 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_8 = (__pyx_t_9 != 0); if (__pyx_t_8) { - /* "gensim/models/doc2vec_inner.pyx":580 + /* "gensim/models/doc2vec_inner.pyx":708 * _word_locks = (np.PyArray_DATA(word_locks)) * if doctag_locks is None: * doctag_locks = model.trainables.vectors_docs_lockf # <<<<<<<<<<<<<< * _doctag_locks = (np.PyArray_DATA(doctag_locks)) * */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 580, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 708, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_vectors_docs_lockf); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 580, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_vectors_docs_lockf); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 708, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF_SET(__pyx_v_doctag_locks, __pyx_t_3); __pyx_t_3 = 0; - /* "gensim/models/doc2vec_inner.pyx":579 + /* "gensim/models/doc2vec_inner.pyx":707 * word_locks = model.trainables.vectors_lockf * _word_locks = (np.PyArray_DATA(word_locks)) * if doctag_locks is None: # <<<<<<<<<<<<<< @@ -7583,17 +7576,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":581 + /* "gensim/models/doc2vec_inner.pyx":709 * if doctag_locks is None: * doctag_locks = model.trainables.vectors_docs_lockf * _doctag_locks = (np.PyArray_DATA(doctag_locks)) # <<<<<<<<<<<<<< * * if hs: */ - if (!(likely(((__pyx_v_doctag_locks) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_doctag_locks, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 581, __pyx_L1_error) + if (!(likely(((__pyx_v_doctag_locks) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_doctag_locks, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 709, __pyx_L1_error) __pyx_v__doctag_locks = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_doctag_locks))); - /* "gensim/models/doc2vec_inner.pyx":583 + /* "gensim/models/doc2vec_inner.pyx":711 * _doctag_locks = (np.PyArray_DATA(doctag_locks)) * * if hs: # <<<<<<<<<<<<<< @@ -7603,23 +7596,23 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_8 = (__pyx_v_hs != 0); if (__pyx_t_8) { - /* "gensim/models/doc2vec_inner.pyx":584 + /* "gensim/models/doc2vec_inner.pyx":712 * * if hs: * syn1 = (np.PyArray_DATA(model.trainables.syn1)) # <<<<<<<<<<<<<< * * if negative: */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 584, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 712, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_syn1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 584, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_syn1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 712, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - if 
(!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 584, __pyx_L1_error) + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 712, __pyx_L1_error) __pyx_v_syn1 = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_1))); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/models/doc2vec_inner.pyx":583 + /* "gensim/models/doc2vec_inner.pyx":711 * _doctag_locks = (np.PyArray_DATA(doctag_locks)) * * if hs: # <<<<<<<<<<<<<< @@ -7628,7 +7621,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":586 + /* "gensim/models/doc2vec_inner.pyx":714 * syn1 = (np.PyArray_DATA(model.trainables.syn1)) * * if negative: # <<<<<<<<<<<<<< @@ -7638,55 +7631,55 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_8 = (__pyx_v_negative != 0); if (__pyx_t_8) { - /* "gensim/models/doc2vec_inner.pyx":587 + /* "gensim/models/doc2vec_inner.pyx":715 * * if negative: * syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) # <<<<<<<<<<<<<< * cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) * cum_table_len = len(model.vocabulary.cum_table) */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 587, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 715, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_syn1neg); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 587, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_syn1neg); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 715, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - if (!(likely(((__pyx_t_3) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_3, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 587, __pyx_L1_error) + if (!(likely(((__pyx_t_3) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_3, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 715, __pyx_L1_error) __pyx_v_syn1neg = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_3))); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/models/doc2vec_inner.pyx":588 + /* "gensim/models/doc2vec_inner.pyx":716 * if negative: * syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) * cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) # <<<<<<<<<<<<<< * cum_table_len = len(model.vocabulary.cum_table) * if negative or sample: */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 588, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 716, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_cum_table); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 588, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_cum_table); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 716, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 588, __pyx_L1_error) + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, 
__pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 716, __pyx_L1_error) __pyx_v_cum_table = ((__pyx_t_5numpy_uint32_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_1))); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/models/doc2vec_inner.pyx":589 + /* "gensim/models/doc2vec_inner.pyx":717 * syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) * cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) * cum_table_len = len(model.vocabulary.cum_table) # <<<<<<<<<<<<<< * if negative or sample: * next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24) */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 589, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_vocabulary); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 717, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_cum_table); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 589, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_cum_table); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 717, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_7 = PyObject_Length(__pyx_t_3); if (unlikely(__pyx_t_7 == ((Py_ssize_t)-1))) __PYX_ERR(0, 589, __pyx_L1_error) + __pyx_t_7 = PyObject_Length(__pyx_t_3); if (unlikely(__pyx_t_7 == ((Py_ssize_t)-1))) __PYX_ERR(0, 717, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_v_cum_table_len = __pyx_t_7; - /* "gensim/models/doc2vec_inner.pyx":586 + /* "gensim/models/doc2vec_inner.pyx":714 * syn1 = (np.PyArray_DATA(model.trainables.syn1)) * * if negative: # <<<<<<<<<<<<<< @@ -7695,7 +7688,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":590 + /* "gensim/models/doc2vec_inner.pyx":718 * cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) * cum_table_len = len(model.vocabulary.cum_table) * if negative or sample: # <<<<<<<<<<<<<< @@ -7713,41 +7706,41 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_L11_bool_binop_done:; if (__pyx_t_8) { - /* "gensim/models/doc2vec_inner.pyx":591 + /* "gensim/models/doc2vec_inner.pyx":719 * cum_table_len = len(model.vocabulary.cum_table) * if negative or sample: * next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24) # <<<<<<<<<<<<<< * * # convert Python structures to primitive types, so we can release the GIL */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_random); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 591, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_random); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 719, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_randint); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 591, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_randint); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 719, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_tuple__6, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 591, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_tuple__6, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 719, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = PyNumber_Multiply(__pyx_int_16777216, __pyx_t_3); 
if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 591, __pyx_L1_error) + __pyx_t_1 = PyNumber_Multiply(__pyx_int_16777216, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 719, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_random); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 591, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_random); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 719, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_randint); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 591, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_randint); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 719, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_10, __pyx_tuple__7, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 591, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_10, __pyx_tuple__7, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 719, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; - __pyx_t_10 = PyNumber_Add(__pyx_t_1, __pyx_t_3); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 591, __pyx_L1_error) + __pyx_t_10 = PyNumber_Add(__pyx_t_1, __pyx_t_3); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 719, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_11 = __Pyx_PyInt_As_unsigned_PY_LONG_LONG(__pyx_t_10); if (unlikely((__pyx_t_11 == (unsigned PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 591, __pyx_L1_error) + __pyx_t_11 = __Pyx_PyInt_As_unsigned_PY_LONG_LONG(__pyx_t_10); if (unlikely((__pyx_t_11 == (unsigned PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 719, __pyx_L1_error) __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; __pyx_v_next_random = __pyx_t_11; - /* "gensim/models/doc2vec_inner.pyx":590 + /* "gensim/models/doc2vec_inner.pyx":718 * cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) * cum_table_len = len(model.vocabulary.cum_table) * if negative or sample: # <<<<<<<<<<<<<< @@ -7756,7 +7749,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":594 + /* "gensim/models/doc2vec_inner.pyx":722 * * # convert Python structures to primitive types, so we can release the GIL * if work is None: # <<<<<<<<<<<<<< @@ -7767,32 +7760,32 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_9 = (__pyx_t_8 != 0); if (__pyx_t_9) { - /* "gensim/models/doc2vec_inner.pyx":595 + /* "gensim/models/doc2vec_inner.pyx":723 * # convert Python structures to primitive types, so we can release the GIL * if work is None: * work = zeros(model.trainables.layer1_size, dtype=REAL) # <<<<<<<<<<<<<< * _work = np.PyArray_DATA(work) * if neu1 is None: */ - __pyx_t_10 = __Pyx_GetModuleGlobalName(__pyx_n_s_zeros); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 595, __pyx_L1_error) + __pyx_t_10 = __Pyx_GetModuleGlobalName(__pyx_n_s_zeros); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 723, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 595, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 723, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = 
__Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 595, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 723, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 595, __pyx_L1_error) + __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 723, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 595, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 723, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_12 = __Pyx_GetModuleGlobalName(__pyx_n_s_REAL); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 595, __pyx_L1_error) + __pyx_t_12 = __Pyx_GetModuleGlobalName(__pyx_n_s_REAL); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 723, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_12); - if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_dtype, __pyx_t_12) < 0) __PYX_ERR(0, 595, __pyx_L1_error) + if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_dtype, __pyx_t_12) < 0) __PYX_ERR(0, 723, __pyx_L1_error) __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; - __pyx_t_12 = __Pyx_PyObject_Call(__pyx_t_10, __pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 595, __pyx_L1_error) + __pyx_t_12 = __Pyx_PyObject_Call(__pyx_t_10, __pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 723, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_12); __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -7800,7 +7793,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __Pyx_DECREF_SET(__pyx_v_work, __pyx_t_12); __pyx_t_12 = 0; - /* "gensim/models/doc2vec_inner.pyx":594 + /* "gensim/models/doc2vec_inner.pyx":722 * * # convert Python structures to primitive types, so we can release the GIL * if work is None: # <<<<<<<<<<<<<< @@ -7809,17 +7802,17 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":596 + /* "gensim/models/doc2vec_inner.pyx":724 * if work is None: * work = zeros(model.trainables.layer1_size, dtype=REAL) * _work = np.PyArray_DATA(work) # <<<<<<<<<<<<<< * if neu1 is None: * neu1 = zeros(model.trainables.layer1_size, dtype=REAL) */ - if (!(likely(((__pyx_v_work) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_work, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 596, __pyx_L1_error) + if (!(likely(((__pyx_v_work) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_work, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 724, __pyx_L1_error) __pyx_v__work = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_work))); - /* "gensim/models/doc2vec_inner.pyx":597 + /* "gensim/models/doc2vec_inner.pyx":725 * work = zeros(model.trainables.layer1_size, dtype=REAL) * _work = np.PyArray_DATA(work) * if neu1 is None: # <<<<<<<<<<<<<< @@ -7830,32 +7823,32 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_8 = (__pyx_t_9 != 0); if (__pyx_t_8) { - /* "gensim/models/doc2vec_inner.pyx":598 + /* "gensim/models/doc2vec_inner.pyx":726 * _work = np.PyArray_DATA(work) * if neu1 is None: * neu1 = zeros(model.trainables.layer1_size, dtype=REAL) # <<<<<<<<<<<<<< * _neu1 = np.PyArray_DATA(neu1) * */ - __pyx_t_12 = 
__Pyx_GetModuleGlobalName(__pyx_n_s_zeros); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 598, __pyx_L1_error) + __pyx_t_12 = __Pyx_GetModuleGlobalName(__pyx_n_s_zeros); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 726, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_12); - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 598, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_trainables); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 726, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 598, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 726, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 598, __pyx_L1_error) + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 726, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 598, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 726, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_10 = __Pyx_GetModuleGlobalName(__pyx_n_s_REAL); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 598, __pyx_L1_error) + __pyx_t_10 = __Pyx_GetModuleGlobalName(__pyx_n_s_REAL); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 726, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); - if (PyDict_SetItem(__pyx_t_3, __pyx_n_s_dtype, __pyx_t_10) < 0) __PYX_ERR(0, 598, __pyx_L1_error) + if (PyDict_SetItem(__pyx_t_3, __pyx_n_s_dtype, __pyx_t_10) < 0) __PYX_ERR(0, 726, __pyx_L1_error) __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; - __pyx_t_10 = __Pyx_PyObject_Call(__pyx_t_12, __pyx_t_1, __pyx_t_3); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 598, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_Call(__pyx_t_12, __pyx_t_1, __pyx_t_3); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 726, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; @@ -7863,7 +7856,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __Pyx_DECREF_SET(__pyx_v_neu1, __pyx_t_10); __pyx_t_10 = 0; - /* "gensim/models/doc2vec_inner.pyx":597 + /* "gensim/models/doc2vec_inner.pyx":725 * work = zeros(model.trainables.layer1_size, dtype=REAL) * _work = np.PyArray_DATA(work) * if neu1 is None: # <<<<<<<<<<<<<< @@ -7872,32 +7865,32 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":599 + /* "gensim/models/doc2vec_inner.pyx":727 * if neu1 is None: * neu1 = zeros(model.trainables.layer1_size, dtype=REAL) * _neu1 = np.PyArray_DATA(neu1) # <<<<<<<<<<<<<< * * vlookup = model.wv.vocab */ - if (!(likely(((__pyx_v_neu1) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_neu1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 599, __pyx_L1_error) + if (!(likely(((__pyx_v_neu1) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_neu1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 727, __pyx_L1_error) __pyx_v__neu1 = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_neu1))); - /* "gensim/models/doc2vec_inner.pyx":601 + /* "gensim/models/doc2vec_inner.pyx":729 * _neu1 = np.PyArray_DATA(neu1) * * vlookup = 
model.wv.vocab # <<<<<<<<<<<<<< * i = 0 * for token in doc_words: */ - __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 601, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 729, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_10, __pyx_n_s_vocab); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 601, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_10, __pyx_n_s_vocab); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 729, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; __pyx_v_vlookup = __pyx_t_3; __pyx_t_3 = 0; - /* "gensim/models/doc2vec_inner.pyx":602 + /* "gensim/models/doc2vec_inner.pyx":730 * * vlookup = model.wv.vocab * i = 0 # <<<<<<<<<<<<<< @@ -7906,7 +7899,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ __pyx_v_i = 0; - /* "gensim/models/doc2vec_inner.pyx":603 + /* "gensim/models/doc2vec_inner.pyx":731 * vlookup = model.wv.vocab * i = 0 * for token in doc_words: # <<<<<<<<<<<<<< @@ -7917,26 +7910,26 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_3 = __pyx_v_doc_words; __Pyx_INCREF(__pyx_t_3); __pyx_t_7 = 0; __pyx_t_13 = NULL; } else { - __pyx_t_7 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_v_doc_words); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 603, __pyx_L1_error) + __pyx_t_7 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_v_doc_words); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 731, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_13 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 603, __pyx_L1_error) + __pyx_t_13 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 731, __pyx_L1_error) } for (;;) { if (likely(!__pyx_t_13)) { if (likely(PyList_CheckExact(__pyx_t_3))) { if (__pyx_t_7 >= PyList_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_10 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_10); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 603, __pyx_L1_error) + __pyx_t_10 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_10); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 731, __pyx_L1_error) #else - __pyx_t_10 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 603, __pyx_L1_error) + __pyx_t_10 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 731, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); #endif } else { if (__pyx_t_7 >= PyTuple_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_10 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_10); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 603, __pyx_L1_error) + __pyx_t_10 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_10); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 731, __pyx_L1_error) #else - __pyx_t_10 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 603, __pyx_L1_error) + __pyx_t_10 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 731, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); #endif } @@ -7946,7 +7939,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if 
(likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 603, __pyx_L1_error) + else __PYX_ERR(0, 731, __pyx_L1_error) } break; } @@ -7955,16 +7948,16 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __Pyx_XDECREF_SET(__pyx_v_token, __pyx_t_10); __pyx_t_10 = 0; - /* "gensim/models/doc2vec_inner.pyx":604 + /* "gensim/models/doc2vec_inner.pyx":732 * i = 0 * for token in doc_words: * predict_word = vlookup[token] if token in vlookup else None # <<<<<<<<<<<<<< * if predict_word is None: # shrink document to leave out word * continue # leaving i unchanged */ - __pyx_t_8 = (__Pyx_PySequence_ContainsTF(__pyx_v_token, __pyx_v_vlookup, Py_EQ)); if (unlikely(__pyx_t_8 < 0)) __PYX_ERR(0, 604, __pyx_L1_error) + __pyx_t_8 = (__Pyx_PySequence_ContainsTF(__pyx_v_token, __pyx_v_vlookup, Py_EQ)); if (unlikely(__pyx_t_8 < 0)) __PYX_ERR(0, 732, __pyx_L1_error) if ((__pyx_t_8 != 0)) { - __pyx_t_1 = PyObject_GetItem(__pyx_v_vlookup, __pyx_v_token); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 604, __pyx_L1_error) + __pyx_t_1 = PyObject_GetItem(__pyx_v_vlookup, __pyx_v_token); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 732, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_10 = __pyx_t_1; __pyx_t_1 = 0; @@ -7975,7 +7968,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __Pyx_XDECREF_SET(__pyx_v_predict_word, __pyx_t_10); __pyx_t_10 = 0; - /* "gensim/models/doc2vec_inner.pyx":605 + /* "gensim/models/doc2vec_inner.pyx":733 * for token in doc_words: * predict_word = vlookup[token] if token in vlookup else None * if predict_word is None: # shrink document to leave out word # <<<<<<<<<<<<<< @@ -7986,7 +7979,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_9 = (__pyx_t_8 != 0); if (__pyx_t_9) { - /* "gensim/models/doc2vec_inner.pyx":606 + /* "gensim/models/doc2vec_inner.pyx":734 * predict_word = vlookup[token] if token in vlookup else None * if predict_word is None: # shrink document to leave out word * continue # leaving i unchanged # <<<<<<<<<<<<<< @@ -7995,7 +7988,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ goto __pyx_L15_continue; - /* "gensim/models/doc2vec_inner.pyx":605 + /* "gensim/models/doc2vec_inner.pyx":733 * for token in doc_words: * predict_word = vlookup[token] if token in vlookup else None * if predict_word is None: # shrink document to leave out word # <<<<<<<<<<<<<< @@ -8004,7 +7997,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":607 + /* "gensim/models/doc2vec_inner.pyx":735 * if predict_word is None: # shrink document to leave out word * continue # leaving i unchanged * if sample and predict_word.sample_int < random_int32(&next_random): # <<<<<<<<<<<<<< @@ -8017,20 +8010,20 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_9 = __pyx_t_8; goto __pyx_L19_bool_binop_done; } - __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_sample_int); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 607, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_sample_int); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 735, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); - __pyx_t_1 = __Pyx_PyInt_From_unsigned_PY_LONG_LONG(__pyx_f_6gensim_6models_14word2vec_inner_random_int32((&__pyx_v_next_random))); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 607, __pyx_L1_error) + 
__pyx_t_1 = __Pyx_PyInt_From_unsigned_PY_LONG_LONG(__pyx_f_6gensim_6models_14word2vec_inner_random_int32((&__pyx_v_next_random))); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 735, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_12 = PyObject_RichCompare(__pyx_t_10, __pyx_t_1, Py_LT); __Pyx_XGOTREF(__pyx_t_12); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 607, __pyx_L1_error) + __pyx_t_12 = PyObject_RichCompare(__pyx_t_10, __pyx_t_1, Py_LT); __Pyx_XGOTREF(__pyx_t_12); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 735, __pyx_L1_error) __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_8 = __Pyx_PyObject_IsTrue(__pyx_t_12); if (unlikely(__pyx_t_8 < 0)) __PYX_ERR(0, 607, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyObject_IsTrue(__pyx_t_12); if (unlikely(__pyx_t_8 < 0)) __PYX_ERR(0, 735, __pyx_L1_error) __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; __pyx_t_9 = __pyx_t_8; __pyx_L19_bool_binop_done:; if (__pyx_t_9) { - /* "gensim/models/doc2vec_inner.pyx":608 + /* "gensim/models/doc2vec_inner.pyx":736 * continue # leaving i unchanged * if sample and predict_word.sample_int < random_int32(&next_random): * continue # <<<<<<<<<<<<<< @@ -8039,7 +8032,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ goto __pyx_L15_continue; - /* "gensim/models/doc2vec_inner.pyx":607 + /* "gensim/models/doc2vec_inner.pyx":735 * if predict_word is None: # shrink document to leave out word * continue # leaving i unchanged * if sample and predict_word.sample_int < random_int32(&next_random): # <<<<<<<<<<<<<< @@ -8048,20 +8041,20 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":609 + /* "gensim/models/doc2vec_inner.pyx":737 * if sample and predict_word.sample_int < random_int32(&next_random): * continue * indexes[i] = predict_word.index # <<<<<<<<<<<<<< * if hs: * codelens[i] = len(predict_word.code) */ - __pyx_t_12 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_index); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 609, __pyx_L1_error) + __pyx_t_12 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_index); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 737, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_12); - __pyx_t_14 = __Pyx_PyInt_As_npy_uint32(__pyx_t_12); if (unlikely((__pyx_t_14 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 609, __pyx_L1_error) + __pyx_t_14 = __Pyx_PyInt_As_npy_uint32(__pyx_t_12); if (unlikely((__pyx_t_14 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 737, __pyx_L1_error) __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; (__pyx_v_indexes[__pyx_v_i]) = __pyx_t_14; - /* "gensim/models/doc2vec_inner.pyx":610 + /* "gensim/models/doc2vec_inner.pyx":738 * continue * indexes[i] = predict_word.index * if hs: # <<<<<<<<<<<<<< @@ -8071,46 +8064,46 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_9 = (__pyx_v_hs != 0); if (__pyx_t_9) { - /* "gensim/models/doc2vec_inner.pyx":611 + /* "gensim/models/doc2vec_inner.pyx":739 * indexes[i] = predict_word.index * if hs: * codelens[i] = len(predict_word.code) # <<<<<<<<<<<<<< * codes[i] = np.PyArray_DATA(predict_word.code) * points[i] = np.PyArray_DATA(predict_word.point) */ - __pyx_t_12 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_code); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 611, __pyx_L1_error) + __pyx_t_12 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_code); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 739, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_12); - 
__pyx_t_5 = PyObject_Length(__pyx_t_12); if (unlikely(__pyx_t_5 == ((Py_ssize_t)-1))) __PYX_ERR(0, 611, __pyx_L1_error) + __pyx_t_5 = PyObject_Length(__pyx_t_12); if (unlikely(__pyx_t_5 == ((Py_ssize_t)-1))) __PYX_ERR(0, 739, __pyx_L1_error) __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; (__pyx_v_codelens[__pyx_v_i]) = ((int)__pyx_t_5); - /* "gensim/models/doc2vec_inner.pyx":612 + /* "gensim/models/doc2vec_inner.pyx":740 * if hs: * codelens[i] = len(predict_word.code) * codes[i] = np.PyArray_DATA(predict_word.code) # <<<<<<<<<<<<<< * points[i] = np.PyArray_DATA(predict_word.point) * result += 1 */ - __pyx_t_12 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_code); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 612, __pyx_L1_error) + __pyx_t_12 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_code); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 740, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_12); - if (!(likely(((__pyx_t_12) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_12, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 612, __pyx_L1_error) + if (!(likely(((__pyx_t_12) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_12, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 740, __pyx_L1_error) (__pyx_v_codes[__pyx_v_i]) = ((__pyx_t_5numpy_uint8_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_12))); __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; - /* "gensim/models/doc2vec_inner.pyx":613 + /* "gensim/models/doc2vec_inner.pyx":741 * codelens[i] = len(predict_word.code) * codes[i] = np.PyArray_DATA(predict_word.code) * points[i] = np.PyArray_DATA(predict_word.point) # <<<<<<<<<<<<<< * result += 1 * i += 1 */ - __pyx_t_12 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_point); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 613, __pyx_L1_error) + __pyx_t_12 = __Pyx_PyObject_GetAttrStr(__pyx_v_predict_word, __pyx_n_s_point); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 741, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_12); - if (!(likely(((__pyx_t_12) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_12, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 613, __pyx_L1_error) + if (!(likely(((__pyx_t_12) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_12, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 741, __pyx_L1_error) (__pyx_v_points[__pyx_v_i]) = ((__pyx_t_5numpy_uint32_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_12))); __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; - /* "gensim/models/doc2vec_inner.pyx":610 + /* "gensim/models/doc2vec_inner.pyx":738 * continue * indexes[i] = predict_word.index * if hs: # <<<<<<<<<<<<<< @@ -8119,7 +8112,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":614 + /* "gensim/models/doc2vec_inner.pyx":742 * codes[i] = np.PyArray_DATA(predict_word.code) * points[i] = np.PyArray_DATA(predict_word.point) * result += 1 # <<<<<<<<<<<<<< @@ -8128,7 +8121,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ __pyx_v_result = (__pyx_v_result + 1); - /* "gensim/models/doc2vec_inner.pyx":615 + /* "gensim/models/doc2vec_inner.pyx":743 * points[i] = np.PyArray_DATA(predict_word.point) * result += 1 * i += 1 # <<<<<<<<<<<<<< @@ -8137,7 +8130,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ __pyx_v_i = (__pyx_v_i + 1); - /* "gensim/models/doc2vec_inner.pyx":616 + /* "gensim/models/doc2vec_inner.pyx":744 * result += 1 * i += 1 * if i == MAX_DOCUMENT_LEN: # <<<<<<<<<<<<<< @@ -8147,7 +8140,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_9 = 
((__pyx_v_i == 0x2710) != 0); if (__pyx_t_9) { - /* "gensim/models/doc2vec_inner.pyx":617 + /* "gensim/models/doc2vec_inner.pyx":745 * i += 1 * if i == MAX_DOCUMENT_LEN: * break # TODO: log warning, tally overflow? # <<<<<<<<<<<<<< @@ -8156,7 +8149,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ goto __pyx_L16_break; - /* "gensim/models/doc2vec_inner.pyx":616 + /* "gensim/models/doc2vec_inner.pyx":744 * result += 1 * i += 1 * if i == MAX_DOCUMENT_LEN: # <<<<<<<<<<<<<< @@ -8165,7 +8158,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":603 + /* "gensim/models/doc2vec_inner.pyx":731 * vlookup = model.wv.vocab * i = 0 * for token in doc_words: # <<<<<<<<<<<<<< @@ -8177,7 +8170,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_L16_break:; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/models/doc2vec_inner.pyx":618 + /* "gensim/models/doc2vec_inner.pyx":746 * if i == MAX_DOCUMENT_LEN: * break # TODO: log warning, tally overflow? * document_len = i # <<<<<<<<<<<<<< @@ -8186,7 +8179,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ __pyx_v_document_len = __pyx_v_i; - /* "gensim/models/doc2vec_inner.pyx":620 + /* "gensim/models/doc2vec_inner.pyx":748 * document_len = i * * for i in range(doctag_len): # <<<<<<<<<<<<<< @@ -8197,20 +8190,20 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con for (__pyx_t_15 = 0; __pyx_t_15 < __pyx_t_2; __pyx_t_15+=1) { __pyx_v_i = __pyx_t_15; - /* "gensim/models/doc2vec_inner.pyx":621 + /* "gensim/models/doc2vec_inner.pyx":749 * * for i in range(doctag_len): * _doctag_indexes[i] = doctag_indexes[i] # <<<<<<<<<<<<<< * result += 1 * */ - __pyx_t_3 = __Pyx_GetItemInt(__pyx_v_doctag_indexes, __pyx_v_i, int, 1, __Pyx_PyInt_From_int, 0, 0, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 621, __pyx_L1_error) + __pyx_t_3 = __Pyx_GetItemInt(__pyx_v_doctag_indexes, __pyx_v_i, int, 1, __Pyx_PyInt_From_int, 0, 0, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 749, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_14 = __Pyx_PyInt_As_npy_uint32(__pyx_t_3); if (unlikely((__pyx_t_14 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 621, __pyx_L1_error) + __pyx_t_14 = __Pyx_PyInt_As_npy_uint32(__pyx_t_3); if (unlikely((__pyx_t_14 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 749, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; (__pyx_v__doctag_indexes[__pyx_v_i]) = __pyx_t_14; - /* "gensim/models/doc2vec_inner.pyx":622 + /* "gensim/models/doc2vec_inner.pyx":750 * for i in range(doctag_len): * _doctag_indexes[i] = doctag_indexes[i] * result += 1 # <<<<<<<<<<<<<< @@ -8220,7 +8213,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_v_result = (__pyx_v_result + 1); } - /* "gensim/models/doc2vec_inner.pyx":625 + /* "gensim/models/doc2vec_inner.pyx":753 * * # release GIL & train on the document * with nogil: # <<<<<<<<<<<<<< @@ -8235,7 +8228,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con #endif /*try:*/ { - /* "gensim/models/doc2vec_inner.pyx":626 + /* "gensim/models/doc2vec_inner.pyx":754 * # release GIL & train on the document * with nogil: * for i in range(document_len): # <<<<<<<<<<<<<< @@ -8246,7 +8239,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con for (__pyx_t_15 = 0; __pyx_t_15 < __pyx_t_2; __pyx_t_15+=1) { 
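  /* The hunks above change only the recorded .pyx line numbers, both in the
     banner comments and in the __PYX_ERR(0, NNN, ...) arguments (e.g. 603 -> 731,
     621 -> 749); the token-gathering logic they compile is unchanged. Rendered
     as plain Python, that quoted preprocessing amounts to roughly the sketch
     below. It is a sketch, not the generated code path: random.getrandbits(32)
     stands in for the C-level random_int32(&next_random) helper, while the
     attribute names (model.wv.vocab, sample_int, index) come from the quoted
     .pyx lines.

import random

def gather_tokens(model, doc_words, sample, max_document_len=0x2710):
    # Keep in-vocabulary tokens, apply frequent-word down-sampling, and
    # cap the document at MAX_DOCUMENT_LEN (0x2710 == 10000), as the
    # quoted .pyx loop does before the nogil training loop runs.
    vlookup = model.wv.vocab
    indexes = []
    for token in doc_words:
        predict_word = vlookup[token] if token in vlookup else None
        if predict_word is None:      # shrink document to leave out word
            continue
        # random.getrandbits(32) is a stand-in for random_int32(&next_random)
        if sample and predict_word.sample_int < random.getrandbits(32):
            continue                  # frequent-word down-sampling
        indexes.append(predict_word.index)
        if len(indexes) == max_document_len:
            break                     # source TODO: log warning, tally overflow?
    return indexes
  */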
__pyx_v_i = __pyx_t_15; - /* "gensim/models/doc2vec_inner.pyx":627 + /* "gensim/models/doc2vec_inner.pyx":755 * with nogil: * for i in range(document_len): * j = i - window # negative OK: will pad with null word # <<<<<<<<<<<<<< @@ -8255,7 +8248,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ __pyx_v_j = (__pyx_v_i - __pyx_v_window); - /* "gensim/models/doc2vec_inner.pyx":628 + /* "gensim/models/doc2vec_inner.pyx":756 * for i in range(document_len): * j = i - window # negative OK: will pad with null word * k = i + window + 1 # past document end OK: will pad with null word # <<<<<<<<<<<<<< @@ -8264,7 +8257,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ __pyx_v_k = ((__pyx_v_i + __pyx_v_window) + 1); - /* "gensim/models/doc2vec_inner.pyx":631 + /* "gensim/models/doc2vec_inner.pyx":759 * * # compose l1 & clear work * for m in range(doctag_len): # <<<<<<<<<<<<<< @@ -8275,7 +8268,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con for (__pyx_t_17 = 0; __pyx_t_17 < __pyx_t_16; __pyx_t_17+=1) { __pyx_v_m = __pyx_t_17; - /* "gensim/models/doc2vec_inner.pyx":633 + /* "gensim/models/doc2vec_inner.pyx":761 * for m in range(doctag_len): * # doc vector(s) * memcpy(&_neu1[m * vector_size], &_doctag_vectors[_doctag_indexes[m] * vector_size], # <<<<<<<<<<<<<< @@ -8285,7 +8278,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con memcpy((&(__pyx_v__neu1[(__pyx_v_m * __pyx_v_vector_size)])), (&(__pyx_v__doctag_vectors[((__pyx_v__doctag_indexes[__pyx_v_m]) * __pyx_v_vector_size)])), (__pyx_v_vector_size * (sizeof(__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)))); } - /* "gensim/models/doc2vec_inner.pyx":635 + /* "gensim/models/doc2vec_inner.pyx":763 * memcpy(&_neu1[m * vector_size], &_doctag_vectors[_doctag_indexes[m] * vector_size], * vector_size * cython.sizeof(REAL_t)) * n = 0 # <<<<<<<<<<<<<< @@ -8294,7 +8287,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ __pyx_v_n = 0; - /* "gensim/models/doc2vec_inner.pyx":636 + /* "gensim/models/doc2vec_inner.pyx":764 * vector_size * cython.sizeof(REAL_t)) * n = 0 * for m in range(j, k): # <<<<<<<<<<<<<< @@ -8305,7 +8298,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con for (__pyx_t_17 = __pyx_v_j; __pyx_t_17 < __pyx_t_16; __pyx_t_17+=1) { __pyx_v_m = __pyx_t_17; - /* "gensim/models/doc2vec_inner.pyx":638 + /* "gensim/models/doc2vec_inner.pyx":766 * for m in range(j, k): * # word vectors in window * if m == i: # <<<<<<<<<<<<<< @@ -8315,7 +8308,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_9 = ((__pyx_v_m == __pyx_v_i) != 0); if (__pyx_t_9) { - /* "gensim/models/doc2vec_inner.pyx":639 + /* "gensim/models/doc2vec_inner.pyx":767 * # word vectors in window * if m == i: * continue # <<<<<<<<<<<<<< @@ -8324,7 +8317,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ goto __pyx_L32_continue; - /* "gensim/models/doc2vec_inner.pyx":638 + /* "gensim/models/doc2vec_inner.pyx":766 * for m in range(j, k): * # word vectors in window * if m == i: # <<<<<<<<<<<<<< @@ -8333,7 +8326,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":640 + /* "gensim/models/doc2vec_inner.pyx":768 * if m == i: * continue * if m < 0 or m >= document_len: # <<<<<<<<<<<<<< @@ -8351,7 +8344,7 @@ static PyObject 
*__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_L36_bool_binop_done:; if (__pyx_t_9) { - /* "gensim/models/doc2vec_inner.pyx":641 + /* "gensim/models/doc2vec_inner.pyx":769 * continue * if m < 0 or m >= document_len: * window_indexes[n] = null_word_index # <<<<<<<<<<<<<< @@ -8360,7 +8353,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ (__pyx_v_window_indexes[__pyx_v_n]) = __pyx_v_null_word_index; - /* "gensim/models/doc2vec_inner.pyx":640 + /* "gensim/models/doc2vec_inner.pyx":768 * if m == i: * continue * if m < 0 or m >= document_len: # <<<<<<<<<<<<<< @@ -8370,7 +8363,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con goto __pyx_L35; } - /* "gensim/models/doc2vec_inner.pyx":643 + /* "gensim/models/doc2vec_inner.pyx":771 * window_indexes[n] = null_word_index * else: * window_indexes[n] = indexes[m] # <<<<<<<<<<<<<< @@ -8382,7 +8375,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con } __pyx_L35:; - /* "gensim/models/doc2vec_inner.pyx":644 + /* "gensim/models/doc2vec_inner.pyx":772 * else: * window_indexes[n] = indexes[m] * n += 1 # <<<<<<<<<<<<<< @@ -8393,7 +8386,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_L32_continue:; } - /* "gensim/models/doc2vec_inner.pyx":645 + /* "gensim/models/doc2vec_inner.pyx":773 * window_indexes[n] = indexes[m] * n += 1 * for m in range(2 * window): # <<<<<<<<<<<<<< @@ -8404,7 +8397,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con for (__pyx_t_16 = 0; __pyx_t_16 < __pyx_t_6; __pyx_t_16+=1) { __pyx_v_m = __pyx_t_16; - /* "gensim/models/doc2vec_inner.pyx":646 + /* "gensim/models/doc2vec_inner.pyx":774 * n += 1 * for m in range(2 * window): * memcpy(&_neu1[(doctag_len + m) * vector_size], &_word_vectors[window_indexes[m] * vector_size], # <<<<<<<<<<<<<< @@ -8414,7 +8407,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con memcpy((&(__pyx_v__neu1[((__pyx_v_doctag_len + __pyx_v_m) * __pyx_v_vector_size)])), (&(__pyx_v__word_vectors[((__pyx_v_window_indexes[__pyx_v_m]) * __pyx_v_vector_size)])), (__pyx_v_vector_size * (sizeof(__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)))); } - /* "gensim/models/doc2vec_inner.pyx":648 + /* "gensim/models/doc2vec_inner.pyx":776 * memcpy(&_neu1[(doctag_len + m) * vector_size], &_word_vectors[window_indexes[m] * vector_size], * vector_size * cython.sizeof(REAL_t)) * memset(_work, 0, layer1_size * cython.sizeof(REAL_t)) # work to accumulate l1 error # <<<<<<<<<<<<<< @@ -8423,7 +8416,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ memset(__pyx_v__work, 0, (__pyx_v_layer1_size * (sizeof(__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)))); - /* "gensim/models/doc2vec_inner.pyx":650 + /* "gensim/models/doc2vec_inner.pyx":778 * memset(_work, 0, layer1_size * cython.sizeof(REAL_t)) # work to accumulate l1 error * * if hs: # <<<<<<<<<<<<<< @@ -8433,7 +8426,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_9 = (__pyx_v_hs != 0); if (__pyx_t_9) { - /* "gensim/models/doc2vec_inner.pyx":651 + /* "gensim/models/doc2vec_inner.pyx":779 * * if hs: * fast_document_dmc_hs(points[i], codes[i], codelens[i], # <<<<<<<<<<<<<< @@ -8442,7 +8435,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs((__pyx_v_points[__pyx_v_i]), 
(__pyx_v_codes[__pyx_v_i]), (__pyx_v_codelens[__pyx_v_i]), __pyx_v__neu1, __pyx_v_syn1, __pyx_v__alpha, __pyx_v__work, __pyx_v_layer1_size, __pyx_v_vector_size, __pyx_v__learn_hidden); - /* "gensim/models/doc2vec_inner.pyx":650 + /* "gensim/models/doc2vec_inner.pyx":778 * memset(_work, 0, layer1_size * cython.sizeof(REAL_t)) # work to accumulate l1 error * * if hs: # <<<<<<<<<<<<<< @@ -8451,7 +8444,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":654 + /* "gensim/models/doc2vec_inner.pyx":782 * _neu1, syn1, _alpha, _work, * layer1_size, vector_size, _learn_hidden) * if negative: # <<<<<<<<<<<<<< @@ -8461,7 +8454,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_9 = (__pyx_v_negative != 0); if (__pyx_t_9) { - /* "gensim/models/doc2vec_inner.pyx":655 + /* "gensim/models/doc2vec_inner.pyx":783 * layer1_size, vector_size, _learn_hidden) * if negative: * next_random = fast_document_dmc_neg(negative, cum_table, cum_table_len, next_random, # <<<<<<<<<<<<<< @@ -8470,7 +8463,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ __pyx_v_next_random = __pyx_f_6gensim_6models_13doc2vec_inner_fast_document_dmc_neg(__pyx_v_negative, __pyx_v_cum_table, __pyx_v_cum_table_len, __pyx_v_next_random, __pyx_v__neu1, __pyx_v_syn1neg, (__pyx_v_indexes[__pyx_v_i]), __pyx_v__alpha, __pyx_v__work, __pyx_v_layer1_size, __pyx_v_vector_size, __pyx_v__learn_hidden); - /* "gensim/models/doc2vec_inner.pyx":654 + /* "gensim/models/doc2vec_inner.pyx":782 * _neu1, syn1, _alpha, _work, * layer1_size, vector_size, _learn_hidden) * if negative: # <<<<<<<<<<<<<< @@ -8479,7 +8472,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":659 + /* "gensim/models/doc2vec_inner.pyx":787 * layer1_size, vector_size, _learn_hidden) * * if _learn_doctags: # <<<<<<<<<<<<<< @@ -8489,7 +8482,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_9 = (__pyx_v__learn_doctags != 0); if (__pyx_t_9) { - /* "gensim/models/doc2vec_inner.pyx":660 + /* "gensim/models/doc2vec_inner.pyx":788 * * if _learn_doctags: * for m in range(doctag_len): # <<<<<<<<<<<<<< @@ -8500,7 +8493,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con for (__pyx_t_17 = 0; __pyx_t_17 < __pyx_t_16; __pyx_t_17+=1) { __pyx_v_m = __pyx_t_17; - /* "gensim/models/doc2vec_inner.pyx":661 + /* "gensim/models/doc2vec_inner.pyx":789 * if _learn_doctags: * for m in range(doctag_len): * our_saxpy(&vector_size, &_doctag_locks[_doctag_indexes[m]], &_work[m * vector_size], # <<<<<<<<<<<<<< @@ -8510,7 +8503,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_vector_size), (&(__pyx_v__doctag_locks[(__pyx_v__doctag_indexes[__pyx_v_m])])), (&(__pyx_v__work[(__pyx_v_m * __pyx_v_vector_size)])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v__doctag_vectors[((__pyx_v__doctag_indexes[__pyx_v_m]) * __pyx_v_vector_size)])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); } - /* "gensim/models/doc2vec_inner.pyx":659 + /* "gensim/models/doc2vec_inner.pyx":787 * layer1_size, vector_size, _learn_hidden) * * if _learn_doctags: # <<<<<<<<<<<<<< @@ -8519,7 +8512,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con */ } - /* "gensim/models/doc2vec_inner.pyx":663 + /* 
"gensim/models/doc2vec_inner.pyx":791 * our_saxpy(&vector_size, &_doctag_locks[_doctag_indexes[m]], &_work[m * vector_size], * &ONE, &_doctag_vectors[_doctag_indexes[m] * vector_size], &ONE) * if _learn_words: # <<<<<<<<<<<<<< @@ -8529,7 +8522,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_t_9 = (__pyx_v__learn_words != 0); if (__pyx_t_9) { - /* "gensim/models/doc2vec_inner.pyx":664 + /* "gensim/models/doc2vec_inner.pyx":792 * &ONE, &_doctag_vectors[_doctag_indexes[m] * vector_size], &ONE) * if _learn_words: * for m in range(2 * window): # <<<<<<<<<<<<<< @@ -8540,7 +8533,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con for (__pyx_t_16 = 0; __pyx_t_16 < __pyx_t_6; __pyx_t_16+=1) { __pyx_v_m = __pyx_t_16; - /* "gensim/models/doc2vec_inner.pyx":665 + /* "gensim/models/doc2vec_inner.pyx":793 * if _learn_words: * for m in range(2 * window): * our_saxpy(&vector_size, &_word_locks[window_indexes[m]], &_work[(doctag_len + m) * vector_size], # <<<<<<<<<<<<<< @@ -8550,7 +8543,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con __pyx_v_6gensim_6models_14word2vec_inner_our_saxpy((&__pyx_v_vector_size), (&(__pyx_v__word_locks[(__pyx_v_window_indexes[__pyx_v_m])])), (&(__pyx_v__work[((__pyx_v_doctag_len + __pyx_v_m) * __pyx_v_vector_size)])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v__word_vectors[((__pyx_v_window_indexes[__pyx_v_m]) * __pyx_v_vector_size)])), (&__pyx_v_6gensim_6models_13doc2vec_inner_ONE)); } - /* "gensim/models/doc2vec_inner.pyx":663 + /* "gensim/models/doc2vec_inner.pyx":791 * our_saxpy(&vector_size, &_doctag_locks[_doctag_indexes[m]], &_work[m * vector_size], * &ONE, &_doctag_vectors[_doctag_indexes[m] * vector_size], &ONE) * if _learn_words: # <<<<<<<<<<<<<< @@ -8561,7 +8554,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con } } - /* "gensim/models/doc2vec_inner.pyx":625 + /* "gensim/models/doc2vec_inner.pyx":753 * * # release GIL & train on the document * with nogil: # <<<<<<<<<<<<<< @@ -8580,19 +8573,19 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con } } - /* "gensim/models/doc2vec_inner.pyx":668 + /* "gensim/models/doc2vec_inner.pyx":796 * &ONE, &_word_vectors[window_indexes[m] * vector_size], &ONE) * * return result # <<<<<<<<<<<<<< */ __Pyx_XDECREF(__pyx_r); - __pyx_t_3 = __Pyx_PyInt_From_long(__pyx_v_result); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 668, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyInt_From_long(__pyx_v_result); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 796, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __pyx_r = __pyx_t_3; __pyx_t_3 = 0; goto __pyx_L0; - /* "gensim/models/doc2vec_inner.pyx":521 + /* "gensim/models/doc2vec_inner.pyx":604 * * * def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, # <<<<<<<<<<<<<< @@ -8623,7 +8616,7 @@ static PyObject *__pyx_pf_6gensim_6models_13doc2vec_inner_4train_document_dm_con return __pyx_r; } -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":214 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":214 * # experimental exception made for __getbuffer__ and __releasebuffer__ * # -- the details of this may change. 
* def __getbuffer__(ndarray self, Py_buffer* info, int flags): # <<<<<<<<<<<<<< @@ -8670,7 +8663,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __Pyx_GIVEREF(__pyx_v_info->obj); } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":220 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":220 * # of flags * * if info == NULL: return # <<<<<<<<<<<<<< @@ -8683,7 +8676,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P goto __pyx_L0; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":223 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":223 * * cdef int copy_shape, i, ndim * cdef int endian_detector = 1 # <<<<<<<<<<<<<< @@ -8692,7 +8685,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ __pyx_v_endian_detector = 1; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":224 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":224 * cdef int copy_shape, i, ndim * cdef int endian_detector = 1 * cdef bint little_endian = ((&endian_detector)[0] != 0) # <<<<<<<<<<<<<< @@ -8701,7 +8694,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ __pyx_v_little_endian = ((((char *)(&__pyx_v_endian_detector))[0]) != 0); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":226 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":226 * cdef bint little_endian = ((&endian_detector)[0] != 0) * * ndim = PyArray_NDIM(self) # <<<<<<<<<<<<<< @@ -8710,7 +8703,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ __pyx_v_ndim = PyArray_NDIM(__pyx_v_self); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":228 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":228 * ndim = PyArray_NDIM(self) * * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<< @@ -8720,7 +8713,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_t_1 = (((sizeof(npy_intp)) != (sizeof(Py_ssize_t))) != 0); if (__pyx_t_1) { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":229 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":229 * * if sizeof(npy_intp) != sizeof(Py_ssize_t): * copy_shape = 1 # <<<<<<<<<<<<<< @@ -8729,7 +8722,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ __pyx_v_copy_shape = 1; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":228 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":228 * ndim = PyArray_NDIM(self) * * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<< @@ -8739,7 +8732,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P goto __pyx_L4; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":231 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":231 * copy_shape = 1 * else: * copy_shape = 0 # <<<<<<<<<<<<<< @@ -8751,7 +8744,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P } __pyx_L4:; - /* 
"../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":233 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":233 * copy_shape = 0 * * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) # <<<<<<<<<<<<<< @@ -8765,7 +8758,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P goto __pyx_L6_bool_binop_done; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":234 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":234 * * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): # <<<<<<<<<<<<<< @@ -8776,7 +8769,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_t_1 = __pyx_t_2; __pyx_L6_bool_binop_done:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":233 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":233 * copy_shape = 0 * * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) # <<<<<<<<<<<<<< @@ -8785,7 +8778,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ if (__pyx_t_1) { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":235 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":235 * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): * raise ValueError(u"ndarray is not C contiguous") # <<<<<<<<<<<<<< @@ -8798,7 +8791,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __PYX_ERR(1, 235, __pyx_L1_error) - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":233 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":233 * copy_shape = 0 * * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) # <<<<<<<<<<<<<< @@ -8807,7 +8800,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":237 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":237 * raise ValueError(u"ndarray is not C contiguous") * * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) # <<<<<<<<<<<<<< @@ -8821,7 +8814,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P goto __pyx_L9_bool_binop_done; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":238 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":238 * * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): # <<<<<<<<<<<<<< @@ -8832,7 +8825,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_t_1 = __pyx_t_2; __pyx_L9_bool_binop_done:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":237 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":237 * raise ValueError(u"ndarray is not C contiguous") * * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) # <<<<<<<<<<<<<< @@ -8841,7 +8834,7 @@ static int 
__pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ if (__pyx_t_1) { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":239 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":239 * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): * raise ValueError(u"ndarray is not Fortran contiguous") # <<<<<<<<<<<<<< @@ -8854,7 +8847,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __PYX_ERR(1, 239, __pyx_L1_error) - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":237 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":237 * raise ValueError(u"ndarray is not C contiguous") * * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) # <<<<<<<<<<<<<< @@ -8863,7 +8856,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":241 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":241 * raise ValueError(u"ndarray is not Fortran contiguous") * * info.buf = PyArray_DATA(self) # <<<<<<<<<<<<<< @@ -8872,7 +8865,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ __pyx_v_info->buf = PyArray_DATA(__pyx_v_self); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":242 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":242 * * info.buf = PyArray_DATA(self) * info.ndim = ndim # <<<<<<<<<<<<<< @@ -8881,7 +8874,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ __pyx_v_info->ndim = __pyx_v_ndim; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":243 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":243 * info.buf = PyArray_DATA(self) * info.ndim = ndim * if copy_shape: # <<<<<<<<<<<<<< @@ -8891,7 +8884,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_t_1 = (__pyx_v_copy_shape != 0); if (__pyx_t_1) { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":246 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":246 * # Allocate new buffer for strides and shape info. * # This is allocated as one block, strides first. * info.strides = PyObject_Malloc(sizeof(Py_ssize_t) * 2 * ndim) # <<<<<<<<<<<<<< @@ -8900,7 +8893,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ __pyx_v_info->strides = ((Py_ssize_t *)PyObject_Malloc((((sizeof(Py_ssize_t)) * 2) * ((size_t)__pyx_v_ndim)))); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":247 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":247 * # This is allocated as one block, strides first. 
* info.strides = PyObject_Malloc(sizeof(Py_ssize_t) * 2 * ndim) * info.shape = info.strides + ndim # <<<<<<<<<<<<<< @@ -8909,7 +8902,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ __pyx_v_info->shape = (__pyx_v_info->strides + __pyx_v_ndim); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":248 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":248 * info.strides = PyObject_Malloc(sizeof(Py_ssize_t) * 2 * ndim) * info.shape = info.strides + ndim * for i in range(ndim): # <<<<<<<<<<<<<< @@ -8920,7 +8913,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P for (__pyx_t_5 = 0; __pyx_t_5 < __pyx_t_4; __pyx_t_5+=1) { __pyx_v_i = __pyx_t_5; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":249 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":249 * info.shape = info.strides + ndim * for i in range(ndim): * info.strides[i] = PyArray_STRIDES(self)[i] # <<<<<<<<<<<<<< @@ -8929,7 +8922,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ (__pyx_v_info->strides[__pyx_v_i]) = (PyArray_STRIDES(__pyx_v_self)[__pyx_v_i]); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":250 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":250 * for i in range(ndim): * info.strides[i] = PyArray_STRIDES(self)[i] * info.shape[i] = PyArray_DIMS(self)[i] # <<<<<<<<<<<<<< @@ -8939,7 +8932,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P (__pyx_v_info->shape[__pyx_v_i]) = (PyArray_DIMS(__pyx_v_self)[__pyx_v_i]); } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":243 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":243 * info.buf = PyArray_DATA(self) * info.ndim = ndim * if copy_shape: # <<<<<<<<<<<<<< @@ -8949,7 +8942,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P goto __pyx_L11; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":252 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":252 * info.shape[i] = PyArray_DIMS(self)[i] * else: * info.strides = PyArray_STRIDES(self) # <<<<<<<<<<<<<< @@ -8959,7 +8952,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P /*else*/ { __pyx_v_info->strides = ((Py_ssize_t *)PyArray_STRIDES(__pyx_v_self)); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":253 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":253 * else: * info.strides = PyArray_STRIDES(self) * info.shape = PyArray_DIMS(self) # <<<<<<<<<<<<<< @@ -8970,7 +8963,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P } __pyx_L11:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":254 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":254 * info.strides = PyArray_STRIDES(self) * info.shape = PyArray_DIMS(self) * info.suboffsets = NULL # <<<<<<<<<<<<<< @@ -8979,7 +8972,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ __pyx_v_info->suboffsets = NULL; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":255 + /* 
"../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":255 * info.shape = PyArray_DIMS(self) * info.suboffsets = NULL * info.itemsize = PyArray_ITEMSIZE(self) # <<<<<<<<<<<<<< @@ -8988,7 +8981,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ __pyx_v_info->itemsize = PyArray_ITEMSIZE(__pyx_v_self); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":256 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":256 * info.suboffsets = NULL * info.itemsize = PyArray_ITEMSIZE(self) * info.readonly = not PyArray_ISWRITEABLE(self) # <<<<<<<<<<<<<< @@ -8997,7 +8990,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ __pyx_v_info->readonly = (!(PyArray_ISWRITEABLE(__pyx_v_self) != 0)); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":259 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":259 * * cdef int t * cdef char* f = NULL # <<<<<<<<<<<<<< @@ -9006,7 +8999,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ __pyx_v_f = NULL; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":260 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":260 * cdef int t * cdef char* f = NULL * cdef dtype descr = self.descr # <<<<<<<<<<<<<< @@ -9018,7 +9011,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_v_descr = ((PyArray_Descr *)__pyx_t_3); __pyx_t_3 = 0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":263 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":263 * cdef int offset * * cdef bint hasfields = PyDataType_HASFIELDS(descr) # <<<<<<<<<<<<<< @@ -9027,7 +9020,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ __pyx_v_hasfields = PyDataType_HASFIELDS(__pyx_v_descr); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":265 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":265 * cdef bint hasfields = PyDataType_HASFIELDS(descr) * * if not hasfields and not copy_shape: # <<<<<<<<<<<<<< @@ -9045,7 +9038,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_L15_bool_binop_done:; if (__pyx_t_1) { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":267 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":267 * if not hasfields and not copy_shape: * # do not call releasebuffer * info.obj = None # <<<<<<<<<<<<<< @@ -9058,7 +9051,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = Py_None; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":265 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":265 * cdef bint hasfields = PyDataType_HASFIELDS(descr) * * if not hasfields and not copy_shape: # <<<<<<<<<<<<<< @@ -9068,7 +9061,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P goto __pyx_L14; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":270 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":270 * 
else: * # need to call releasebuffer * info.obj = self # <<<<<<<<<<<<<< @@ -9084,7 +9077,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P } __pyx_L14:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":272 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":272 * info.obj = self * * if not hasfields: # <<<<<<<<<<<<<< @@ -9094,7 +9087,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_t_1 = ((!(__pyx_v_hasfields != 0)) != 0); if (__pyx_t_1) { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":273 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":273 * * if not hasfields: * t = descr.type_num # <<<<<<<<<<<<<< @@ -9104,7 +9097,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_t_4 = __pyx_v_descr->type_num; __pyx_v_t = __pyx_t_4; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":274 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":274 * if not hasfields: * t = descr.type_num * if ((descr.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< @@ -9124,7 +9117,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P } __pyx_L20_next_or:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":275 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":275 * t = descr.type_num * if ((descr.byteorder == c'>' and little_endian) or * (descr.byteorder == c'<' and not little_endian)): # <<<<<<<<<<<<<< @@ -9141,7 +9134,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_t_1 = __pyx_t_2; __pyx_L19_bool_binop_done:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":274 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":274 * if not hasfields: * t = descr.type_num * if ((descr.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< @@ -9150,7 +9143,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ if (__pyx_t_1) { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":276 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":276 * if ((descr.byteorder == c'>' and little_endian) or * (descr.byteorder == c'<' and not little_endian)): * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<< @@ -9163,7 +9156,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __PYX_ERR(1, 276, __pyx_L1_error) - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":274 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":274 * if not hasfields: * t = descr.type_num * if ((descr.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< @@ -9172,7 +9165,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":277 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":277 * (descr.byteorder == c'<' and not little_endian)): * raise ValueError(u"Non-native byte order not supported") * if t == NPY_BYTE: f 
= "b" # <<<<<<<<<<<<<< @@ -9184,7 +9177,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_v_f = ((char *)"b"); break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":278 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":278 * raise ValueError(u"Non-native byte order not supported") * if t == NPY_BYTE: f = "b" * elif t == NPY_UBYTE: f = "B" # <<<<<<<<<<<<<< @@ -9195,7 +9188,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_v_f = ((char *)"B"); break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":279 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":279 * if t == NPY_BYTE: f = "b" * elif t == NPY_UBYTE: f = "B" * elif t == NPY_SHORT: f = "h" # <<<<<<<<<<<<<< @@ -9206,7 +9199,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_v_f = ((char *)"h"); break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":280 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":280 * elif t == NPY_UBYTE: f = "B" * elif t == NPY_SHORT: f = "h" * elif t == NPY_USHORT: f = "H" # <<<<<<<<<<<<<< @@ -9217,7 +9210,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_v_f = ((char *)"H"); break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":281 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":281 * elif t == NPY_SHORT: f = "h" * elif t == NPY_USHORT: f = "H" * elif t == NPY_INT: f = "i" # <<<<<<<<<<<<<< @@ -9228,7 +9221,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_v_f = ((char *)"i"); break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":282 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":282 * elif t == NPY_USHORT: f = "H" * elif t == NPY_INT: f = "i" * elif t == NPY_UINT: f = "I" # <<<<<<<<<<<<<< @@ -9239,7 +9232,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_v_f = ((char *)"I"); break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":283 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":283 * elif t == NPY_INT: f = "i" * elif t == NPY_UINT: f = "I" * elif t == NPY_LONG: f = "l" # <<<<<<<<<<<<<< @@ -9250,7 +9243,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_v_f = ((char *)"l"); break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":284 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":284 * elif t == NPY_UINT: f = "I" * elif t == NPY_LONG: f = "l" * elif t == NPY_ULONG: f = "L" # <<<<<<<<<<<<<< @@ -9261,7 +9254,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_v_f = ((char *)"L"); break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":285 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":285 * elif t == NPY_LONG: f = "l" * elif t == NPY_ULONG: f = "L" * elif t == NPY_LONGLONG: f = "q" # <<<<<<<<<<<<<< @@ -9272,7 +9265,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject 
*__pyx_v_self, P __pyx_v_f = ((char *)"q"); break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":286 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":286 * elif t == NPY_ULONG: f = "L" * elif t == NPY_LONGLONG: f = "q" * elif t == NPY_ULONGLONG: f = "Q" # <<<<<<<<<<<<<< @@ -9283,7 +9276,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_v_f = ((char *)"Q"); break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":287 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":287 * elif t == NPY_LONGLONG: f = "q" * elif t == NPY_ULONGLONG: f = "Q" * elif t == NPY_FLOAT: f = "f" # <<<<<<<<<<<<<< @@ -9294,7 +9287,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_v_f = ((char *)"f"); break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":288 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":288 * elif t == NPY_ULONGLONG: f = "Q" * elif t == NPY_FLOAT: f = "f" * elif t == NPY_DOUBLE: f = "d" # <<<<<<<<<<<<<< @@ -9305,7 +9298,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_v_f = ((char *)"d"); break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":289 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":289 * elif t == NPY_FLOAT: f = "f" * elif t == NPY_DOUBLE: f = "d" * elif t == NPY_LONGDOUBLE: f = "g" # <<<<<<<<<<<<<< @@ -9316,7 +9309,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_v_f = ((char *)"g"); break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":290 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":290 * elif t == NPY_DOUBLE: f = "d" * elif t == NPY_LONGDOUBLE: f = "g" * elif t == NPY_CFLOAT: f = "Zf" # <<<<<<<<<<<<<< @@ -9327,7 +9320,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_v_f = ((char *)"Zf"); break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":291 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":291 * elif t == NPY_LONGDOUBLE: f = "g" * elif t == NPY_CFLOAT: f = "Zf" * elif t == NPY_CDOUBLE: f = "Zd" # <<<<<<<<<<<<<< @@ -9338,7 +9331,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_v_f = ((char *)"Zd"); break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":292 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":292 * elif t == NPY_CFLOAT: f = "Zf" * elif t == NPY_CDOUBLE: f = "Zd" * elif t == NPY_CLONGDOUBLE: f = "Zg" # <<<<<<<<<<<<<< @@ -9349,7 +9342,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_v_f = ((char *)"Zg"); break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":293 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":293 * elif t == NPY_CDOUBLE: f = "Zd" * elif t == NPY_CLONGDOUBLE: f = "Zg" * elif t == NPY_OBJECT: f = "O" # <<<<<<<<<<<<<< @@ -9361,7 +9354,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P break; default: - /* 
"../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":295 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":295 * elif t == NPY_OBJECT: f = "O" * else: * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) # <<<<<<<<<<<<<< @@ -9387,7 +9380,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P break; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":296 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":296 * else: * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) * info.format = f # <<<<<<<<<<<<<< @@ -9396,7 +9389,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ __pyx_v_info->format = __pyx_v_f; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":297 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":297 * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) * info.format = f * return # <<<<<<<<<<<<<< @@ -9406,7 +9399,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_r = 0; goto __pyx_L0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":272 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":272 * info.obj = self * * if not hasfields: # <<<<<<<<<<<<<< @@ -9415,7 +9408,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":299 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":299 * return * else: * info.format = PyObject_Malloc(_buffer_format_string_len) # <<<<<<<<<<<<<< @@ -9425,7 +9418,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P /*else*/ { __pyx_v_info->format = ((char *)PyObject_Malloc(0xFF)); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":300 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":300 * else: * info.format = PyObject_Malloc(_buffer_format_string_len) * info.format[0] = c'^' # Native data types, manual alignment # <<<<<<<<<<<<<< @@ -9434,7 +9427,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ (__pyx_v_info->format[0]) = '^'; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":301 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":301 * info.format = PyObject_Malloc(_buffer_format_string_len) * info.format[0] = c'^' # Native data types, manual alignment * offset = 0 # <<<<<<<<<<<<<< @@ -9443,7 +9436,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P */ __pyx_v_offset = 0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":302 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":302 * info.format[0] = c'^' # Native data types, manual alignment * offset = 0 * f = _util_dtypestring(descr, info.format + 1, # <<<<<<<<<<<<<< @@ -9453,7 +9446,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P __pyx_t_7 = __pyx_f_5numpy__util_dtypestring(__pyx_v_descr, (__pyx_v_info->format + 1), (__pyx_v_info->format + 0xFF), (&__pyx_v_offset)); if (unlikely(__pyx_t_7 == 
((char *)NULL))) __PYX_ERR(1, 302, __pyx_L1_error) __pyx_v_f = __pyx_t_7; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":305 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":305 * info.format + _buffer_format_string_len, * &offset) * f[0] = c'\0' # Terminate format string # <<<<<<<<<<<<<< @@ -9463,7 +9456,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P (__pyx_v_f[0]) = '\x00'; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":214 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":214 * # experimental exception made for __getbuffer__ and __releasebuffer__ * # -- the details of this may change. * def __getbuffer__(ndarray self, Py_buffer* info, int flags): # <<<<<<<<<<<<<< @@ -9495,7 +9488,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P return __pyx_r; } -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":307 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":307 * f[0] = c'\0' # Terminate format string * * def __releasebuffer__(ndarray self, Py_buffer* info): # <<<<<<<<<<<<<< @@ -9519,7 +9512,7 @@ static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_s int __pyx_t_1; __Pyx_RefNannySetupContext("__releasebuffer__", 0); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":308 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":308 * * def __releasebuffer__(ndarray self, Py_buffer* info): * if PyArray_HASFIELDS(self): # <<<<<<<<<<<<<< @@ -9529,7 +9522,7 @@ static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_s __pyx_t_1 = (PyArray_HASFIELDS(__pyx_v_self) != 0); if (__pyx_t_1) { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":309 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":309 * def __releasebuffer__(ndarray self, Py_buffer* info): * if PyArray_HASFIELDS(self): * PyObject_Free(info.format) # <<<<<<<<<<<<<< @@ -9538,7 +9531,7 @@ static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_s */ PyObject_Free(__pyx_v_info->format); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":308 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":308 * * def __releasebuffer__(ndarray self, Py_buffer* info): * if PyArray_HASFIELDS(self): # <<<<<<<<<<<<<< @@ -9547,7 +9540,7 @@ static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_s */ } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":310 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":310 * if PyArray_HASFIELDS(self): * PyObject_Free(info.format) * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<< @@ -9557,7 +9550,7 @@ static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_s __pyx_t_1 = (((sizeof(npy_intp)) != (sizeof(Py_ssize_t))) != 0); if (__pyx_t_1) { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":311 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":311 * PyObject_Free(info.format) * if sizeof(npy_intp) != sizeof(Py_ssize_t): * PyObject_Free(info.strides) # <<<<<<<<<<<<<< @@ -9566,7 
+9559,7 @@ static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_s */ PyObject_Free(__pyx_v_info->strides); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":310 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":310 * if PyArray_HASFIELDS(self): * PyObject_Free(info.format) * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<< @@ -9575,7 +9568,7 @@ static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_s */ } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":307 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":307 * f[0] = c'\0' # Terminate format string * * def __releasebuffer__(ndarray self, Py_buffer* info): # <<<<<<<<<<<<<< @@ -9587,7 +9580,7 @@ static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_s __Pyx_RefNannyFinishContext(); } -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":788 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":788 * ctypedef npy_cdouble complex_t * * cdef inline object PyArray_MultiIterNew1(a): # <<<<<<<<<<<<<< @@ -9601,7 +9594,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew1(PyObject *__ PyObject *__pyx_t_1 = NULL; __Pyx_RefNannySetupContext("PyArray_MultiIterNew1", 0); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":789 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":789 * * cdef inline object PyArray_MultiIterNew1(a): * return PyArray_MultiIterNew(1, a) # <<<<<<<<<<<<<< @@ -9615,7 +9608,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew1(PyObject *__ __pyx_t_1 = 0; goto __pyx_L0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":788 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":788 * ctypedef npy_cdouble complex_t * * cdef inline object PyArray_MultiIterNew1(a): # <<<<<<<<<<<<<< @@ -9634,7 +9627,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew1(PyObject *__ return __pyx_r; } -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":791 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":791 * return PyArray_MultiIterNew(1, a) * * cdef inline object PyArray_MultiIterNew2(a, b): # <<<<<<<<<<<<<< @@ -9648,7 +9641,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew2(PyObject *__ PyObject *__pyx_t_1 = NULL; __Pyx_RefNannySetupContext("PyArray_MultiIterNew2", 0); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":792 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":792 * * cdef inline object PyArray_MultiIterNew2(a, b): * return PyArray_MultiIterNew(2, a, b) # <<<<<<<<<<<<<< @@ -9662,7 +9655,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew2(PyObject *__ __pyx_t_1 = 0; goto __pyx_L0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":791 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":791 * return PyArray_MultiIterNew(1, a) * * cdef inline object PyArray_MultiIterNew2(a, b): # <<<<<<<<<<<<<< @@ -9681,7 +9674,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew2(PyObject *__ return __pyx_r; } -/* 
"../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":794 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":794 * return PyArray_MultiIterNew(2, a, b) * * cdef inline object PyArray_MultiIterNew3(a, b, c): # <<<<<<<<<<<<<< @@ -9695,7 +9688,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew3(PyObject *__ PyObject *__pyx_t_1 = NULL; __Pyx_RefNannySetupContext("PyArray_MultiIterNew3", 0); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":795 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":795 * * cdef inline object PyArray_MultiIterNew3(a, b, c): * return PyArray_MultiIterNew(3, a, b, c) # <<<<<<<<<<<<<< @@ -9709,7 +9702,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew3(PyObject *__ __pyx_t_1 = 0; goto __pyx_L0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":794 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":794 * return PyArray_MultiIterNew(2, a, b) * * cdef inline object PyArray_MultiIterNew3(a, b, c): # <<<<<<<<<<<<<< @@ -9728,7 +9721,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew3(PyObject *__ return __pyx_r; } -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":797 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":797 * return PyArray_MultiIterNew(3, a, b, c) * * cdef inline object PyArray_MultiIterNew4(a, b, c, d): # <<<<<<<<<<<<<< @@ -9742,7 +9735,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew4(PyObject *__ PyObject *__pyx_t_1 = NULL; __Pyx_RefNannySetupContext("PyArray_MultiIterNew4", 0); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":798 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":798 * * cdef inline object PyArray_MultiIterNew4(a, b, c, d): * return PyArray_MultiIterNew(4, a, b, c, d) # <<<<<<<<<<<<<< @@ -9756,7 +9749,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew4(PyObject *__ __pyx_t_1 = 0; goto __pyx_L0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":797 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":797 * return PyArray_MultiIterNew(3, a, b, c) * * cdef inline object PyArray_MultiIterNew4(a, b, c, d): # <<<<<<<<<<<<<< @@ -9775,7 +9768,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew4(PyObject *__ return __pyx_r; } -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":800 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":800 * return PyArray_MultiIterNew(4, a, b, c, d) * * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): # <<<<<<<<<<<<<< @@ -9789,7 +9782,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew5(PyObject *__ PyObject *__pyx_t_1 = NULL; __Pyx_RefNannySetupContext("PyArray_MultiIterNew5", 0); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":801 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":801 * * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): * return PyArray_MultiIterNew(5, a, b, c, d, e) # <<<<<<<<<<<<<< @@ -9803,7 +9796,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew5(PyObject *__ __pyx_t_1 = 
0; goto __pyx_L0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":800 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":800 * return PyArray_MultiIterNew(4, a, b, c, d) * * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): # <<<<<<<<<<<<<< @@ -9822,7 +9815,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew5(PyObject *__ return __pyx_r; } -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":803 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":803 * return PyArray_MultiIterNew(5, a, b, c, d, e) * * cdef inline tuple PyDataType_SHAPE(dtype d): # <<<<<<<<<<<<<< @@ -9836,7 +9829,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyDataType_SHAPE(PyArray_Descr *__ int __pyx_t_1; __Pyx_RefNannySetupContext("PyDataType_SHAPE", 0); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":804 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":804 * * cdef inline tuple PyDataType_SHAPE(dtype d): * if PyDataType_HASSUBARRAY(d): # <<<<<<<<<<<<<< @@ -9846,7 +9839,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyDataType_SHAPE(PyArray_Descr *__ __pyx_t_1 = (PyDataType_HASSUBARRAY(__pyx_v_d) != 0); if (__pyx_t_1) { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":805 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":805 * cdef inline tuple PyDataType_SHAPE(dtype d): * if PyDataType_HASSUBARRAY(d): * return d.subarray.shape # <<<<<<<<<<<<<< @@ -9858,7 +9851,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyDataType_SHAPE(PyArray_Descr *__ __pyx_r = ((PyObject*)__pyx_v_d->subarray->shape); goto __pyx_L0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":804 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":804 * * cdef inline tuple PyDataType_SHAPE(dtype d): * if PyDataType_HASSUBARRAY(d): # <<<<<<<<<<<<<< @@ -9867,7 +9860,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyDataType_SHAPE(PyArray_Descr *__ */ } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":807 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":807 * return d.subarray.shape * else: * return () # <<<<<<<<<<<<<< @@ -9881,7 +9874,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyDataType_SHAPE(PyArray_Descr *__ goto __pyx_L0; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":803 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":803 * return PyArray_MultiIterNew(5, a, b, c, d, e) * * cdef inline tuple PyDataType_SHAPE(dtype d): # <<<<<<<<<<<<<< @@ -9896,7 +9889,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyDataType_SHAPE(PyArray_Descr *__ return __pyx_r; } -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":809 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":809 * return () * * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL: # <<<<<<<<<<<<<< @@ -9925,7 +9918,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx char *__pyx_t_9; __Pyx_RefNannySetupContext("_util_dtypestring", 0); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":814 
+ /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":814 * * cdef dtype child * cdef int endian_detector = 1 # <<<<<<<<<<<<<< @@ -9934,7 +9927,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx */ __pyx_v_endian_detector = 1; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":815 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":815 * cdef dtype child * cdef int endian_detector = 1 * cdef bint little_endian = ((&endian_detector)[0] != 0) # <<<<<<<<<<<<<< @@ -9943,7 +9936,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx */ __pyx_v_little_endian = ((((char *)(&__pyx_v_endian_detector))[0]) != 0); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":818 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":818 * cdef tuple fields * * for childname in descr.names: # <<<<<<<<<<<<<< @@ -9966,7 +9959,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx __Pyx_XDECREF_SET(__pyx_v_childname, __pyx_t_3); __pyx_t_3 = 0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":819 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":819 * * for childname in descr.names: * fields = descr.fields[childname] # <<<<<<<<<<<<<< @@ -9983,7 +9976,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx __Pyx_XDECREF_SET(__pyx_v_fields, ((PyObject*)__pyx_t_3)); __pyx_t_3 = 0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":820 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":820 * for childname in descr.names: * fields = descr.fields[childname] * child, new_offset = fields # <<<<<<<<<<<<<< @@ -10022,7 +10015,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx __Pyx_XDECREF_SET(__pyx_v_new_offset, __pyx_t_4); __pyx_t_4 = 0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":822 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":822 * child, new_offset = fields * * if (end - f) - (new_offset - offset[0]) < 15: # <<<<<<<<<<<<<< @@ -10039,7 +10032,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx __pyx_t_6 = ((((__pyx_v_end - __pyx_v_f) - ((int)__pyx_t_5)) < 15) != 0); if (__pyx_t_6) { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":823 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":823 * * if (end - f) - (new_offset - offset[0]) < 15: * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") # <<<<<<<<<<<<<< @@ -10052,7 +10045,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __PYX_ERR(1, 823, __pyx_L1_error) - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":822 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":822 * child, new_offset = fields * * if (end - f) - (new_offset - offset[0]) < 15: # <<<<<<<<<<<<<< @@ -10061,7 +10054,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx */ } - /* 
"../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":825 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":825 * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") * * if ((child.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< @@ -10081,7 +10074,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx } __pyx_L8_next_or:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":826 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":826 * * if ((child.byteorder == c'>' and little_endian) or * (child.byteorder == c'<' and not little_endian)): # <<<<<<<<<<<<<< @@ -10098,7 +10091,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx __pyx_t_6 = __pyx_t_7; __pyx_L7_bool_binop_done:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":825 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":825 * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") * * if ((child.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< @@ -10107,7 +10100,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx */ if (__pyx_t_6) { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":827 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":827 * if ((child.byteorder == c'>' and little_endian) or * (child.byteorder == c'<' and not little_endian)): * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<< @@ -10120,7 +10113,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __PYX_ERR(1, 827, __pyx_L1_error) - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":825 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":825 * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") * * if ((child.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< @@ -10129,7 +10122,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx */ } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":837 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":837 * * # Output padding bytes * while offset[0] < new_offset: # <<<<<<<<<<<<<< @@ -10145,7 +10138,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; if (!__pyx_t_6) break; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":838 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":838 * # Output padding bytes * while offset[0] < new_offset: * f[0] = 120 # "x"; pad byte # <<<<<<<<<<<<<< @@ -10154,7 +10147,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx */ (__pyx_v_f[0]) = 0x78; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":839 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":839 * while offset[0] < new_offset: * f[0] = 120 # "x"; pad byte * f += 1 # <<<<<<<<<<<<<< @@ -10163,7 +10156,7 @@ static CYTHON_INLINE char 
*__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx */ __pyx_v_f = (__pyx_v_f + 1); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":840 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":840 * f[0] = 120 # "x"; pad byte * f += 1 * offset[0] += 1 # <<<<<<<<<<<<<< @@ -10174,7 +10167,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx (__pyx_v_offset[__pyx_t_8]) = ((__pyx_v_offset[__pyx_t_8]) + 1); } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":842 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":842 * offset[0] += 1 * * offset[0] += child.itemsize # <<<<<<<<<<<<<< @@ -10184,7 +10177,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx __pyx_t_8 = 0; (__pyx_v_offset[__pyx_t_8]) = ((__pyx_v_offset[__pyx_t_8]) + __pyx_v_child->elsize); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":844 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":844 * offset[0] += child.itemsize * * if not PyDataType_HASFIELDS(child): # <<<<<<<<<<<<<< @@ -10194,7 +10187,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx __pyx_t_6 = ((!(PyDataType_HASFIELDS(__pyx_v_child) != 0)) != 0); if (__pyx_t_6) { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":845 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":845 * * if not PyDataType_HASFIELDS(child): * t = child.type_num # <<<<<<<<<<<<<< @@ -10206,7 +10199,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx __Pyx_XDECREF_SET(__pyx_v_t, __pyx_t_4); __pyx_t_4 = 0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":846 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":846 * if not PyDataType_HASFIELDS(child): * t = child.type_num * if end - f < 5: # <<<<<<<<<<<<<< @@ -10216,7 +10209,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx __pyx_t_6 = (((__pyx_v_end - __pyx_v_f) < 5) != 0); if (__pyx_t_6) { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":847 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":847 * t = child.type_num * if end - f < 5: * raise RuntimeError(u"Format string allocated too short.") # <<<<<<<<<<<<<< @@ -10229,7 +10222,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; __PYX_ERR(1, 847, __pyx_L1_error) - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":846 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":846 * if not PyDataType_HASFIELDS(child): * t = child.type_num * if end - f < 5: # <<<<<<<<<<<<<< @@ -10238,7 +10231,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx */ } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":850 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":850 * * # Until ticket #99 is fixed, use integers to avoid warnings * if t == NPY_BYTE: f[0] = 98 #"b" # <<<<<<<<<<<<<< @@ -10256,7 +10249,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto 
__pyx_L15; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":851 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":851 * # Until ticket #99 is fixed, use integers to avoid warnings * if t == NPY_BYTE: f[0] = 98 #"b" * elif t == NPY_UBYTE: f[0] = 66 #"B" # <<<<<<<<<<<<<< @@ -10274,7 +10267,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L15; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":852 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":852 * if t == NPY_BYTE: f[0] = 98 #"b" * elif t == NPY_UBYTE: f[0] = 66 #"B" * elif t == NPY_SHORT: f[0] = 104 #"h" # <<<<<<<<<<<<<< @@ -10292,7 +10285,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L15; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":853 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":853 * elif t == NPY_UBYTE: f[0] = 66 #"B" * elif t == NPY_SHORT: f[0] = 104 #"h" * elif t == NPY_USHORT: f[0] = 72 #"H" # <<<<<<<<<<<<<< @@ -10310,7 +10303,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L15; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":854 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":854 * elif t == NPY_SHORT: f[0] = 104 #"h" * elif t == NPY_USHORT: f[0] = 72 #"H" * elif t == NPY_INT: f[0] = 105 #"i" # <<<<<<<<<<<<<< @@ -10328,7 +10321,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L15; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":855 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":855 * elif t == NPY_USHORT: f[0] = 72 #"H" * elif t == NPY_INT: f[0] = 105 #"i" * elif t == NPY_UINT: f[0] = 73 #"I" # <<<<<<<<<<<<<< @@ -10346,7 +10339,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L15; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":856 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":856 * elif t == NPY_INT: f[0] = 105 #"i" * elif t == NPY_UINT: f[0] = 73 #"I" * elif t == NPY_LONG: f[0] = 108 #"l" # <<<<<<<<<<<<<< @@ -10364,7 +10357,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L15; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":857 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":857 * elif t == NPY_UINT: f[0] = 73 #"I" * elif t == NPY_LONG: f[0] = 108 #"l" * elif t == NPY_ULONG: f[0] = 76 #"L" # <<<<<<<<<<<<<< @@ -10382,7 +10375,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L15; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":858 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":858 * elif t == NPY_LONG: f[0] = 108 #"l" * elif t == NPY_ULONG: f[0] = 76 #"L" * elif t == NPY_LONGLONG: f[0] = 113 #"q" # <<<<<<<<<<<<<< @@ -10400,7 +10393,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L15; } - /* 
"../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":859 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":859 * elif t == NPY_ULONG: f[0] = 76 #"L" * elif t == NPY_LONGLONG: f[0] = 113 #"q" * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" # <<<<<<<<<<<<<< @@ -10418,7 +10411,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L15; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":860 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":860 * elif t == NPY_LONGLONG: f[0] = 113 #"q" * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" * elif t == NPY_FLOAT: f[0] = 102 #"f" # <<<<<<<<<<<<<< @@ -10436,7 +10429,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L15; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":861 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":861 * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" * elif t == NPY_FLOAT: f[0] = 102 #"f" * elif t == NPY_DOUBLE: f[0] = 100 #"d" # <<<<<<<<<<<<<< @@ -10454,7 +10447,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L15; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":862 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":862 * elif t == NPY_FLOAT: f[0] = 102 #"f" * elif t == NPY_DOUBLE: f[0] = 100 #"d" * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" # <<<<<<<<<<<<<< @@ -10472,7 +10465,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L15; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":863 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":863 * elif t == NPY_DOUBLE: f[0] = 100 #"d" * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf # <<<<<<<<<<<<<< @@ -10492,7 +10485,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L15; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":864 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":864 * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd # <<<<<<<<<<<<<< @@ -10512,7 +10505,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L15; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":865 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":865 * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg # <<<<<<<<<<<<<< @@ -10532,7 +10525,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L15; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":866 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":866 * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg * elif t == NPY_OBJECT: f[0] = 79 
#"O" # <<<<<<<<<<<<<< @@ -10550,7 +10543,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L15; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":868 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":868 * elif t == NPY_OBJECT: f[0] = 79 #"O" * else: * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) # <<<<<<<<<<<<<< @@ -10574,7 +10567,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx } __pyx_L15:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":869 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":869 * else: * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) * f += 1 # <<<<<<<<<<<<<< @@ -10583,7 +10576,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx */ __pyx_v_f = (__pyx_v_f + 1); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":844 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":844 * offset[0] += child.itemsize * * if not PyDataType_HASFIELDS(child): # <<<<<<<<<<<<<< @@ -10593,7 +10586,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx goto __pyx_L13; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":873 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":873 * # Cython ignores struct boundary information ("T{...}"), * # so don't output it * f = _util_dtypestring(child, f, end, offset) # <<<<<<<<<<<<<< @@ -10606,7 +10599,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx } __pyx_L13:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":818 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":818 * cdef tuple fields * * for childname in descr.names: # <<<<<<<<<<<<<< @@ -10616,7 +10609,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":874 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":874 * # so don't output it * f = _util_dtypestring(child, f, end, offset) * return f # <<<<<<<<<<<<<< @@ -10626,7 +10619,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx __pyx_r = __pyx_v_f; goto __pyx_L0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":809 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":809 * return () * * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL: # <<<<<<<<<<<<<< @@ -10651,7 +10644,7 @@ static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx return __pyx_r; } -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":990 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":990 * * * cdef inline void set_array_base(ndarray arr, object base): # <<<<<<<<<<<<<< @@ -10666,7 +10659,7 @@ static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_a int __pyx_t_2; __Pyx_RefNannySetupContext("set_array_base", 0); - /* 
"../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":992 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":992 * cdef inline void set_array_base(ndarray arr, object base): * cdef PyObject* baseptr * if base is None: # <<<<<<<<<<<<<< @@ -10677,7 +10670,7 @@ static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_a __pyx_t_2 = (__pyx_t_1 != 0); if (__pyx_t_2) { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":993 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":993 * cdef PyObject* baseptr * if base is None: * baseptr = NULL # <<<<<<<<<<<<<< @@ -10686,7 +10679,7 @@ static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_a */ __pyx_v_baseptr = NULL; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":992 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":992 * cdef inline void set_array_base(ndarray arr, object base): * cdef PyObject* baseptr * if base is None: # <<<<<<<<<<<<<< @@ -10696,7 +10689,7 @@ static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_a goto __pyx_L3; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":995 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":995 * baseptr = NULL * else: * Py_INCREF(base) # important to do this before decref below! # <<<<<<<<<<<<<< @@ -10706,7 +10699,7 @@ static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_a /*else*/ { Py_INCREF(__pyx_v_base); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":996 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":996 * else: * Py_INCREF(base) # important to do this before decref below! * baseptr = base # <<<<<<<<<<<<<< @@ -10717,7 +10710,7 @@ static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_a } __pyx_L3:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":997 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":997 * Py_INCREF(base) # important to do this before decref below! 
* baseptr = base * Py_XDECREF(arr.base) # <<<<<<<<<<<<<< @@ -10726,7 +10719,7 @@ static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_a */ Py_XDECREF(__pyx_v_arr->base); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":998 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":998 * baseptr = base * Py_XDECREF(arr.base) * arr.base = baseptr # <<<<<<<<<<<<<< @@ -10735,7 +10728,7 @@ static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_a */ __pyx_v_arr->base = __pyx_v_baseptr; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":990 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":990 * * * cdef inline void set_array_base(ndarray arr, object base): # <<<<<<<<<<<<<< @@ -10747,7 +10740,7 @@ static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_a __Pyx_RefNannyFinishContext(); } -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1000 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1000 * arr.base = baseptr * * cdef inline object get_array_base(ndarray arr): # <<<<<<<<<<<<<< @@ -10761,7 +10754,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__py int __pyx_t_1; __Pyx_RefNannySetupContext("get_array_base", 0); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1001 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1001 * * cdef inline object get_array_base(ndarray arr): * if arr.base is NULL: # <<<<<<<<<<<<<< @@ -10771,7 +10764,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__py __pyx_t_1 = ((__pyx_v_arr->base == NULL) != 0); if (__pyx_t_1) { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1002 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1002 * cdef inline object get_array_base(ndarray arr): * if arr.base is NULL: * return None # <<<<<<<<<<<<<< @@ -10783,7 +10776,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__py __pyx_r = Py_None; goto __pyx_L0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1001 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1001 * * cdef inline object get_array_base(ndarray arr): * if arr.base is NULL: # <<<<<<<<<<<<<< @@ -10792,7 +10785,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__py */ } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1004 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1004 * return None * else: * return arr.base # <<<<<<<<<<<<<< @@ -10806,7 +10799,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__py goto __pyx_L0; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1000 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1000 * arr.base = baseptr * * cdef inline object get_array_base(ndarray arr): # <<<<<<<<<<<<<< @@ -10821,7 +10814,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__py return __pyx_r; } -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1009 +/* 
"../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1009 * # Versions of the import_* functions which are more suitable for * # Cython code. * cdef inline int import_array() except -1: # <<<<<<<<<<<<<< @@ -10842,7 +10835,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) { PyObject *__pyx_t_8 = NULL; __Pyx_RefNannySetupContext("import_array", 0); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1010 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1010 * # Cython code. * cdef inline int import_array() except -1: * try: # <<<<<<<<<<<<<< @@ -10858,7 +10851,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) { __Pyx_XGOTREF(__pyx_t_3); /*try:*/ { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1011 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1011 * cdef inline int import_array() except -1: * try: * _import_array() # <<<<<<<<<<<<<< @@ -10867,7 +10860,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) { */ __pyx_t_4 = _import_array(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 1011, __pyx_L3_error) - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1010 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1010 * # Cython code. * cdef inline int import_array() except -1: * try: # <<<<<<<<<<<<<< @@ -10881,7 +10874,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) { goto __pyx_L8_try_end; __pyx_L3_error:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1012 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1012 * try: * _import_array() * except Exception: # <<<<<<<<<<<<<< @@ -10896,7 +10889,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) { __Pyx_GOTREF(__pyx_t_6); __Pyx_GOTREF(__pyx_t_7); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1013 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1013 * _import_array() * except Exception: * raise ImportError("numpy.core.multiarray failed to import") # <<<<<<<<<<<<<< @@ -10912,7 +10905,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) { goto __pyx_L5_except_error; __pyx_L5_except_error:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1010 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1010 * # Cython code. * cdef inline int import_array() except -1: * try: # <<<<<<<<<<<<<< @@ -10927,7 +10920,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) { __pyx_L8_try_end:; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1009 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1009 * # Versions of the import_* functions which are more suitable for * # Cython code. 
* cdef inline int import_array() except -1: # <<<<<<<<<<<<<< @@ -10950,7 +10943,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) { return __pyx_r; } -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1015 +/* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1015 * raise ImportError("numpy.core.multiarray failed to import") * * cdef inline int import_umath() except -1: # <<<<<<<<<<<<<< @@ -10971,7 +10964,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) { PyObject *__pyx_t_8 = NULL; __Pyx_RefNannySetupContext("import_umath", 0); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1016 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1016 * * cdef inline int import_umath() except -1: * try: # <<<<<<<<<<<<<< @@ -10987,7 +10980,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) { __Pyx_XGOTREF(__pyx_t_3); /*try:*/ { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1017 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1017 * cdef inline int import_umath() except -1: * try: * _import_umath() # <<<<<<<<<<<<<< @@ -10996,7 +10989,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) { */ __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 1017, __pyx_L3_error) - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1016 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1016 * * cdef inline int import_umath() except -1: * try: # <<<<<<<<<<<<<< @@ -11010,7 +11003,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) { goto __pyx_L8_try_end; __pyx_L3_error:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1018 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1018 * try: * _import_umath() * except Exception: # <<<<<<<<<<<<<< @@ -11025,7 +11018,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) { __Pyx_GOTREF(__pyx_t_6); __Pyx_GOTREF(__pyx_t_7); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1019 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1019 * _import_umath() * except Exception: * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< @@ -11041,7 +11034,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) { goto __pyx_L5_except_error; __pyx_L5_except_error:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1016 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1016 * * cdef inline int import_umath() except -1: * try: # <<<<<<<<<<<<<< @@ -11056,7 +11049,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) { __pyx_L8_try_end:; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1015 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1015 * raise ImportError("numpy.core.multiarray failed to import") * * cdef inline int import_umath() except -1: # <<<<<<<<<<<<<< @@ -11079,7 +11072,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) { return __pyx_r; } -/* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1021 +/* 
"../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1021 * raise ImportError("numpy.core.umath failed to import") * * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<< @@ -11100,7 +11093,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) { PyObject *__pyx_t_8 = NULL; __Pyx_RefNannySetupContext("import_ufunc", 0); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1022 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1022 * * cdef inline int import_ufunc() except -1: * try: # <<<<<<<<<<<<<< @@ -11116,7 +11109,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) { __Pyx_XGOTREF(__pyx_t_3); /*try:*/ { - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1023 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1023 * cdef inline int import_ufunc() except -1: * try: * _import_umath() # <<<<<<<<<<<<<< @@ -11125,7 +11118,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) { */ __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 1023, __pyx_L3_error) - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1022 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1022 * * cdef inline int import_ufunc() except -1: * try: # <<<<<<<<<<<<<< @@ -11139,7 +11132,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) { goto __pyx_L8_try_end; __pyx_L3_error:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1024 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1024 * try: * _import_umath() * except Exception: # <<<<<<<<<<<<<< @@ -11153,7 +11146,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) { __Pyx_GOTREF(__pyx_t_6); __Pyx_GOTREF(__pyx_t_7); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1025 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1025 * _import_umath() * except Exception: * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< @@ -11167,7 +11160,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) { goto __pyx_L5_except_error; __pyx_L5_except_error:; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1022 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1022 * * cdef inline int import_ufunc() except -1: * try: # <<<<<<<<<<<<<< @@ -11182,7 +11175,7 @@ static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) { __pyx_L8_try_end:; } - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1021 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1021 * raise ImportError("numpy.core.umath failed to import") * * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<< @@ -11223,7 +11216,7 @@ static PyModuleDef_Slot __pyx_moduledef_slots[] = { static struct PyModuleDef __pyx_moduledef = { PyModuleDef_HEAD_INIT, "doc2vec_inner", - 0, /* m_doc */ + __pyx_k_Optimized_cython_functions_for_t, /* m_doc */ #if CYTHON_PEP489_MULTI_PHASE_INIT 0, /* m_size */ #else @@ -11242,7 +11235,6 @@ static struct PyModuleDef __pyx_moduledef = { #endif static __Pyx_StringTabEntry __pyx_string_tab[] = { - {&__pyx_n_s_FAST_VERSION, __pyx_k_FAST_VERSION, sizeof(__pyx_k_FAST_VERSION), 0, 0, 
1, 1}, {&__pyx_kp_u_Format_string_allocated_too_shor, __pyx_k_Format_string_allocated_too_shor, sizeof(__pyx_k_Format_string_allocated_too_shor), 0, 1, 0, 0}, {&__pyx_kp_u_Format_string_allocated_too_shor_2, __pyx_k_Format_string_allocated_too_shor_2, sizeof(__pyx_k_Format_string_allocated_too_shor_2), 0, 1, 0, 0}, {&__pyx_n_s_ImportError, __pyx_k_ImportError, sizeof(__pyx_k_ImportError), 0, 0, 1, 1}, @@ -11345,7 +11337,6 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_n_s_vocabulary, __pyx_k_vocabulary, sizeof(__pyx_k_vocabulary), 0, 0, 1, 1}, {&__pyx_n_s_window, __pyx_k_window, sizeof(__pyx_k_window), 0, 0, 1, 1}, {&__pyx_n_s_window_indexes, __pyx_k_window_indexes, sizeof(__pyx_k_window_indexes), 0, 0, 1, 1}, - {&__pyx_n_s_word2vec, __pyx_k_word2vec, sizeof(__pyx_k_word2vec), 0, 0, 1, 1}, {&__pyx_n_s_word_locks, __pyx_k_word_locks, sizeof(__pyx_k_word_locks), 0, 0, 1, 1}, {&__pyx_n_s_word_locks_2, __pyx_k_word_locks_2, sizeof(__pyx_k_word_locks_2), 0, 0, 1, 1}, {&__pyx_n_s_word_vectors, __pyx_k_word_vectors, sizeof(__pyx_k_word_vectors), 0, 0, 1, 1}, @@ -11358,8 +11349,8 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { }; static int __Pyx_InitCachedBuiltins(void) { __pyx_builtin_ImportError = __Pyx_GetBuiltinName(__pyx_n_s_ImportError); if (!__pyx_builtin_ImportError) __PYX_ERR(0, 21, __pyx_L1_error) - __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) __PYX_ERR(0, 52, __pyx_L1_error) - __pyx_builtin_enumerate = __Pyx_GetBuiltinName(__pyx_n_s_enumerate); if (!__pyx_builtin_enumerate) __PYX_ERR(0, 319, __pyx_L1_error) + __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) __PYX_ERR(0, 46, __pyx_L1_error) + __pyx_builtin_enumerate = __Pyx_GetBuiltinName(__pyx_n_s_enumerate); if (!__pyx_builtin_enumerate) __PYX_ERR(0, 357, __pyx_L1_error) __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) __PYX_ERR(1, 235, __pyx_L1_error) __pyx_builtin_RuntimeError = __Pyx_GetBuiltinName(__pyx_n_s_RuntimeError); if (!__pyx_builtin_RuntimeError) __PYX_ERR(1, 823, __pyx_L1_error) return 0; @@ -11371,49 +11362,49 @@ static int __Pyx_InitCachedConstants(void) { __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); - /* "gensim/models/doc2vec_inner.pyx":291 + /* "gensim/models/doc2vec_inner.pyx":329 * cum_table_len = len(model.vocabulary.cum_table) * if negative or sample: * next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24) # <<<<<<<<<<<<<< * * # convert Python structures to primitive types, so we can release the GIL */ - __pyx_tuple_ = PyTuple_Pack(2, __pyx_int_0, __pyx_int_16777216); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 291, __pyx_L1_error) + __pyx_tuple_ = PyTuple_Pack(2, __pyx_int_0, __pyx_int_16777216); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 329, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple_); __Pyx_GIVEREF(__pyx_tuple_); - __pyx_tuple__2 = PyTuple_Pack(2, __pyx_int_0, __pyx_int_16777216); if (unlikely(!__pyx_tuple__2)) __PYX_ERR(0, 291, __pyx_L1_error) + __pyx_tuple__2 = PyTuple_Pack(2, __pyx_int_0, __pyx_int_16777216); if (unlikely(!__pyx_tuple__2)) __PYX_ERR(0, 329, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__2); __Pyx_GIVEREF(__pyx_tuple__2); - /* "gensim/models/doc2vec_inner.pyx":429 + /* "gensim/models/doc2vec_inner.pyx":512 * cum_table_len = len(model.vocabulary.cum_table) * if negative or sample: * next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24) # 
<<<<<<<<<<<<<< * * # convert Python structures to primitive types, so we can release the GIL */ - __pyx_tuple__3 = PyTuple_Pack(2, __pyx_int_0, __pyx_int_16777216); if (unlikely(!__pyx_tuple__3)) __PYX_ERR(0, 429, __pyx_L1_error) + __pyx_tuple__3 = PyTuple_Pack(2, __pyx_int_0, __pyx_int_16777216); if (unlikely(!__pyx_tuple__3)) __PYX_ERR(0, 512, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__3); __Pyx_GIVEREF(__pyx_tuple__3); - __pyx_tuple__4 = PyTuple_Pack(2, __pyx_int_0, __pyx_int_16777216); if (unlikely(!__pyx_tuple__4)) __PYX_ERR(0, 429, __pyx_L1_error) + __pyx_tuple__4 = PyTuple_Pack(2, __pyx_int_0, __pyx_int_16777216); if (unlikely(!__pyx_tuple__4)) __PYX_ERR(0, 512, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__4); __Pyx_GIVEREF(__pyx_tuple__4); - /* "gensim/models/doc2vec_inner.pyx":591 + /* "gensim/models/doc2vec_inner.pyx":719 * cum_table_len = len(model.vocabulary.cum_table) * if negative or sample: * next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24) # <<<<<<<<<<<<<< * * # convert Python structures to primitive types, so we can release the GIL */ - __pyx_tuple__6 = PyTuple_Pack(2, __pyx_int_0, __pyx_int_16777216); if (unlikely(!__pyx_tuple__6)) __PYX_ERR(0, 591, __pyx_L1_error) + __pyx_tuple__6 = PyTuple_Pack(2, __pyx_int_0, __pyx_int_16777216); if (unlikely(!__pyx_tuple__6)) __PYX_ERR(0, 719, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__6); __Pyx_GIVEREF(__pyx_tuple__6); - __pyx_tuple__7 = PyTuple_Pack(2, __pyx_int_0, __pyx_int_16777216); if (unlikely(!__pyx_tuple__7)) __PYX_ERR(0, 591, __pyx_L1_error) + __pyx_tuple__7 = PyTuple_Pack(2, __pyx_int_0, __pyx_int_16777216); if (unlikely(!__pyx_tuple__7)) __PYX_ERR(0, 719, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__7); __Pyx_GIVEREF(__pyx_tuple__7); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":235 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":235 * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): * raise ValueError(u"ndarray is not C contiguous") # <<<<<<<<<<<<<< @@ -11424,7 +11415,7 @@ static int __Pyx_InitCachedConstants(void) { __Pyx_GOTREF(__pyx_tuple__8); __Pyx_GIVEREF(__pyx_tuple__8); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":239 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":239 * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): * raise ValueError(u"ndarray is not Fortran contiguous") # <<<<<<<<<<<<<< @@ -11435,7 +11426,7 @@ static int __Pyx_InitCachedConstants(void) { __Pyx_GOTREF(__pyx_tuple__9); __Pyx_GIVEREF(__pyx_tuple__9); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":276 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":276 * if ((descr.byteorder == c'>' and little_endian) or * (descr.byteorder == c'<' and not little_endian)): * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<< @@ -11446,7 +11437,7 @@ static int __Pyx_InitCachedConstants(void) { __Pyx_GOTREF(__pyx_tuple__10); __Pyx_GIVEREF(__pyx_tuple__10); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":823 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":823 * * if (end - f) - (new_offset - offset[0]) < 15: * raise RuntimeError(u"Format string allocated too short, see 
comment in numpy.pxd") # <<<<<<<<<<<<<< @@ -11457,7 +11448,7 @@ static int __Pyx_InitCachedConstants(void) { __Pyx_GOTREF(__pyx_tuple__11); __Pyx_GIVEREF(__pyx_tuple__11); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":827 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":827 * if ((child.byteorder == c'>' and little_endian) or * (child.byteorder == c'<' and not little_endian)): * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<< @@ -11468,7 +11459,7 @@ static int __Pyx_InitCachedConstants(void) { __Pyx_GOTREF(__pyx_tuple__12); __Pyx_GIVEREF(__pyx_tuple__12); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":847 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":847 * t = child.type_num * if end - f < 5: * raise RuntimeError(u"Format string allocated too short.") # <<<<<<<<<<<<<< @@ -11479,7 +11470,7 @@ static int __Pyx_InitCachedConstants(void) { __Pyx_GOTREF(__pyx_tuple__13); __Pyx_GIVEREF(__pyx_tuple__13); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1013 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1013 * _import_array() * except Exception: * raise ImportError("numpy.core.multiarray failed to import") # <<<<<<<<<<<<<< @@ -11490,7 +11481,7 @@ static int __Pyx_InitCachedConstants(void) { __Pyx_GOTREF(__pyx_tuple__14); __Pyx_GIVEREF(__pyx_tuple__14); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1019 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1019 * _import_umath() * except Exception: * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< @@ -11501,7 +11492,7 @@ static int __Pyx_InitCachedConstants(void) { __Pyx_GOTREF(__pyx_tuple__15); __Pyx_GIVEREF(__pyx_tuple__15); - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1025 + /* "../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1025 * _import_umath() * except Exception: * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< @@ -11510,41 +11501,41 @@ static int __Pyx_InitCachedConstants(void) { __Pyx_GOTREF(__pyx_tuple__16); __Pyx_GIVEREF(__pyx_tuple__16); - /* "gensim/models/doc2vec_inner.pyx":227 + /* "gensim/models/doc2vec_inner.pyx":221 * * * def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, # <<<<<<<<<<<<<< * train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): */ - __pyx_tuple__18 = PyTuple_Pack(50, __pyx_n_s_model, __pyx_n_s_doc_words, __pyx_n_s_doctag_indexes, __pyx_n_s_alpha, __pyx_n_s_work, __pyx_n_s_train_words, __pyx_n_s_learn_doctags, __pyx_n_s_learn_words, __pyx_n_s_learn_hidden, __pyx_n_s_word_vectors, __pyx_n_s_word_locks, __pyx_n_s_doctag_vectors, __pyx_n_s_doctag_locks, __pyx_n_s_hs, __pyx_n_s_negative, __pyx_n_s_sample, __pyx_n_s_train_words_2, __pyx_n_s_learn_words_2, __pyx_n_s_learn_hidden_2, __pyx_n_s_learn_doctags_2, __pyx_n_s_word_vectors_2, __pyx_n_s_doctag_vectors_2, __pyx_n_s_word_locks_2, __pyx_n_s_doctag_locks_2, __pyx_n_s_work_2, __pyx_n_s_alpha_2, __pyx_n_s_size, __pyx_n_s_codelens, __pyx_n_s_indexes, __pyx_n_s_doctag_indexes_2, __pyx_n_s_reduced_windows, __pyx_n_s_document_len, __pyx_n_s_doctag_len, __pyx_n_s_window, __pyx_n_s_i, __pyx_n_s_j, 
__pyx_n_s_r, __pyx_n_s_result, __pyx_n_s_syn1, __pyx_n_s_points, __pyx_n_s_codes, __pyx_n_s_syn1neg, __pyx_n_s_cum_table, __pyx_n_s_cum_table_len, __pyx_n_s_next_random, __pyx_n_s_vlookup, __pyx_n_s_token, __pyx_n_s_predict_word, __pyx_n_s_item, __pyx_n_s_k); if (unlikely(!__pyx_tuple__18)) __PYX_ERR(0, 227, __pyx_L1_error) + __pyx_tuple__18 = PyTuple_Pack(50, __pyx_n_s_model, __pyx_n_s_doc_words, __pyx_n_s_doctag_indexes, __pyx_n_s_alpha, __pyx_n_s_work, __pyx_n_s_train_words, __pyx_n_s_learn_doctags, __pyx_n_s_learn_words, __pyx_n_s_learn_hidden, __pyx_n_s_word_vectors, __pyx_n_s_word_locks, __pyx_n_s_doctag_vectors, __pyx_n_s_doctag_locks, __pyx_n_s_hs, __pyx_n_s_negative, __pyx_n_s_sample, __pyx_n_s_train_words_2, __pyx_n_s_learn_words_2, __pyx_n_s_learn_hidden_2, __pyx_n_s_learn_doctags_2, __pyx_n_s_word_vectors_2, __pyx_n_s_doctag_vectors_2, __pyx_n_s_word_locks_2, __pyx_n_s_doctag_locks_2, __pyx_n_s_work_2, __pyx_n_s_alpha_2, __pyx_n_s_size, __pyx_n_s_codelens, __pyx_n_s_indexes, __pyx_n_s_doctag_indexes_2, __pyx_n_s_reduced_windows, __pyx_n_s_document_len, __pyx_n_s_doctag_len, __pyx_n_s_window, __pyx_n_s_i, __pyx_n_s_j, __pyx_n_s_r, __pyx_n_s_result, __pyx_n_s_syn1, __pyx_n_s_points, __pyx_n_s_codes, __pyx_n_s_syn1neg, __pyx_n_s_cum_table, __pyx_n_s_cum_table_len, __pyx_n_s_next_random, __pyx_n_s_vlookup, __pyx_n_s_token, __pyx_n_s_predict_word, __pyx_n_s_item, __pyx_n_s_k); if (unlikely(!__pyx_tuple__18)) __PYX_ERR(0, 221, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__18); __Pyx_GIVEREF(__pyx_tuple__18); - __pyx_codeobj__19 = (PyObject*)__Pyx_PyCode_New(13, 0, 50, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__18, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_gensim_models_doc2vec_inner_pyx, __pyx_n_s_train_document_dbow, 227, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__19)) __PYX_ERR(0, 227, __pyx_L1_error) + __pyx_codeobj__19 = (PyObject*)__Pyx_PyCode_New(13, 0, 50, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__18, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_gensim_models_doc2vec_inner_pyx, __pyx_n_s_train_document_dbow, 221, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__19)) __PYX_ERR(0, 221, __pyx_L1_error) - /* "gensim/models/doc2vec_inner.pyx":363 + /* "gensim/models/doc2vec_inner.pyx":401 * * * def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, # <<<<<<<<<<<<<< * learn_doctags=True, learn_words=True, learn_hidden=True, * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): */ - __pyx_tuple__20 = PyTuple_Pack(53, __pyx_n_s_model, __pyx_n_s_doc_words, __pyx_n_s_doctag_indexes, __pyx_n_s_alpha, __pyx_n_s_work, __pyx_n_s_neu1, __pyx_n_s_learn_doctags, __pyx_n_s_learn_words, __pyx_n_s_learn_hidden, __pyx_n_s_word_vectors, __pyx_n_s_word_locks, __pyx_n_s_doctag_vectors, __pyx_n_s_doctag_locks, __pyx_n_s_hs, __pyx_n_s_negative, __pyx_n_s_sample, __pyx_n_s_learn_doctags_2, __pyx_n_s_learn_words_2, __pyx_n_s_learn_hidden_2, __pyx_n_s_cbow_mean, __pyx_n_s_count, __pyx_n_s_inv_count, __pyx_n_s_word_vectors_2, __pyx_n_s_doctag_vectors_2, __pyx_n_s_word_locks_2, __pyx_n_s_doctag_locks_2, __pyx_n_s_work_2, __pyx_n_s_neu1_2, __pyx_n_s_alpha_2, __pyx_n_s_size, __pyx_n_s_codelens, __pyx_n_s_indexes, __pyx_n_s_doctag_indexes_2, __pyx_n_s_reduced_windows, __pyx_n_s_document_len, __pyx_n_s_doctag_len, __pyx_n_s_window, __pyx_n_s_i, __pyx_n_s_j, __pyx_n_s_k, __pyx_n_s_m, __pyx_n_s_result, __pyx_n_s_syn1, 
__pyx_n_s_points, __pyx_n_s_codes, __pyx_n_s_syn1neg, __pyx_n_s_cum_table, __pyx_n_s_cum_table_len, __pyx_n_s_next_random, __pyx_n_s_vlookup, __pyx_n_s_token, __pyx_n_s_predict_word, __pyx_n_s_item); if (unlikely(!__pyx_tuple__20)) __PYX_ERR(0, 363, __pyx_L1_error) + __pyx_tuple__20 = PyTuple_Pack(53, __pyx_n_s_model, __pyx_n_s_doc_words, __pyx_n_s_doctag_indexes, __pyx_n_s_alpha, __pyx_n_s_work, __pyx_n_s_neu1, __pyx_n_s_learn_doctags, __pyx_n_s_learn_words, __pyx_n_s_learn_hidden, __pyx_n_s_word_vectors, __pyx_n_s_word_locks, __pyx_n_s_doctag_vectors, __pyx_n_s_doctag_locks, __pyx_n_s_hs, __pyx_n_s_negative, __pyx_n_s_sample, __pyx_n_s_learn_doctags_2, __pyx_n_s_learn_words_2, __pyx_n_s_learn_hidden_2, __pyx_n_s_cbow_mean, __pyx_n_s_count, __pyx_n_s_inv_count, __pyx_n_s_word_vectors_2, __pyx_n_s_doctag_vectors_2, __pyx_n_s_word_locks_2, __pyx_n_s_doctag_locks_2, __pyx_n_s_work_2, __pyx_n_s_neu1_2, __pyx_n_s_alpha_2, __pyx_n_s_size, __pyx_n_s_codelens, __pyx_n_s_indexes, __pyx_n_s_doctag_indexes_2, __pyx_n_s_reduced_windows, __pyx_n_s_document_len, __pyx_n_s_doctag_len, __pyx_n_s_window, __pyx_n_s_i, __pyx_n_s_j, __pyx_n_s_k, __pyx_n_s_m, __pyx_n_s_result, __pyx_n_s_syn1, __pyx_n_s_points, __pyx_n_s_codes, __pyx_n_s_syn1neg, __pyx_n_s_cum_table, __pyx_n_s_cum_table_len, __pyx_n_s_next_random, __pyx_n_s_vlookup, __pyx_n_s_token, __pyx_n_s_predict_word, __pyx_n_s_item); if (unlikely(!__pyx_tuple__20)) __PYX_ERR(0, 401, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__20); __Pyx_GIVEREF(__pyx_tuple__20); - __pyx_codeobj__21 = (PyObject*)__Pyx_PyCode_New(13, 0, 53, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__20, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_gensim_models_doc2vec_inner_pyx, __pyx_n_s_train_document_dm, 363, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__21)) __PYX_ERR(0, 363, __pyx_L1_error) + __pyx_codeobj__21 = (PyObject*)__Pyx_PyCode_New(13, 0, 53, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__20, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_gensim_models_doc2vec_inner_pyx, __pyx_n_s_train_document_dm, 401, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__21)) __PYX_ERR(0, 401, __pyx_L1_error) - /* "gensim/models/doc2vec_inner.pyx":521 + /* "gensim/models/doc2vec_inner.pyx":604 * * * def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, # <<<<<<<<<<<<<< * learn_doctags=True, learn_words=True, learn_hidden=True, * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): */ - __pyx_tuple__22 = PyTuple_Pack(53, __pyx_n_s_model, __pyx_n_s_doc_words, __pyx_n_s_doctag_indexes, __pyx_n_s_alpha, __pyx_n_s_work, __pyx_n_s_neu1, __pyx_n_s_learn_doctags, __pyx_n_s_learn_words, __pyx_n_s_learn_hidden, __pyx_n_s_word_vectors, __pyx_n_s_word_locks, __pyx_n_s_doctag_vectors, __pyx_n_s_doctag_locks, __pyx_n_s_hs, __pyx_n_s_negative, __pyx_n_s_sample, __pyx_n_s_learn_doctags_2, __pyx_n_s_learn_words_2, __pyx_n_s_learn_hidden_2, __pyx_n_s_word_vectors_2, __pyx_n_s_doctag_vectors_2, __pyx_n_s_word_locks_2, __pyx_n_s_doctag_locks_2, __pyx_n_s_work_2, __pyx_n_s_neu1_2, __pyx_n_s_alpha_2, __pyx_n_s_layer1_size, __pyx_n_s_vector_size, __pyx_n_s_codelens, __pyx_n_s_indexes, __pyx_n_s_doctag_indexes_2, __pyx_n_s_window_indexes, __pyx_n_s_document_len, __pyx_n_s_doctag_len, __pyx_n_s_window, __pyx_n_s_expected_doctag_len, __pyx_n_s_i, __pyx_n_s_j, __pyx_n_s_k, __pyx_n_s_m, __pyx_n_s_n, __pyx_n_s_result, __pyx_n_s_null_word_index, 
__pyx_n_s_syn1, __pyx_n_s_points, __pyx_n_s_codes, __pyx_n_s_syn1neg, __pyx_n_s_cum_table, __pyx_n_s_cum_table_len, __pyx_n_s_next_random, __pyx_n_s_vlookup, __pyx_n_s_token, __pyx_n_s_predict_word); if (unlikely(!__pyx_tuple__22)) __PYX_ERR(0, 521, __pyx_L1_error) + __pyx_tuple__22 = PyTuple_Pack(53, __pyx_n_s_model, __pyx_n_s_doc_words, __pyx_n_s_doctag_indexes, __pyx_n_s_alpha, __pyx_n_s_work, __pyx_n_s_neu1, __pyx_n_s_learn_doctags, __pyx_n_s_learn_words, __pyx_n_s_learn_hidden, __pyx_n_s_word_vectors, __pyx_n_s_word_locks, __pyx_n_s_doctag_vectors, __pyx_n_s_doctag_locks, __pyx_n_s_hs, __pyx_n_s_negative, __pyx_n_s_sample, __pyx_n_s_learn_doctags_2, __pyx_n_s_learn_words_2, __pyx_n_s_learn_hidden_2, __pyx_n_s_word_vectors_2, __pyx_n_s_doctag_vectors_2, __pyx_n_s_word_locks_2, __pyx_n_s_doctag_locks_2, __pyx_n_s_work_2, __pyx_n_s_neu1_2, __pyx_n_s_alpha_2, __pyx_n_s_layer1_size, __pyx_n_s_vector_size, __pyx_n_s_codelens, __pyx_n_s_indexes, __pyx_n_s_doctag_indexes_2, __pyx_n_s_window_indexes, __pyx_n_s_document_len, __pyx_n_s_doctag_len, __pyx_n_s_window, __pyx_n_s_expected_doctag_len, __pyx_n_s_i, __pyx_n_s_j, __pyx_n_s_k, __pyx_n_s_m, __pyx_n_s_n, __pyx_n_s_result, __pyx_n_s_null_word_index, __pyx_n_s_syn1, __pyx_n_s_points, __pyx_n_s_codes, __pyx_n_s_syn1neg, __pyx_n_s_cum_table, __pyx_n_s_cum_table_len, __pyx_n_s_next_random, __pyx_n_s_vlookup, __pyx_n_s_token, __pyx_n_s_predict_word); if (unlikely(!__pyx_tuple__22)) __PYX_ERR(0, 604, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__22); __Pyx_GIVEREF(__pyx_tuple__22); - __pyx_codeobj__23 = (PyObject*)__Pyx_PyCode_New(13, 0, 53, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__22, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_gensim_models_doc2vec_inner_pyx, __pyx_n_s_train_document_dm_concat, 521, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__23)) __PYX_ERR(0, 521, __pyx_L1_error) + __pyx_codeobj__23 = (PyObject*)__Pyx_PyCode_New(13, 0, 53, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__22, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_gensim_models_doc2vec_inner_pyx, __pyx_n_s_train_document_dm_concat, 604, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__23)) __PYX_ERR(0, 604, __pyx_L1_error) __Pyx_RefNannyFinishContext(); return 0; __pyx_L1_error:; @@ -11670,7 +11661,7 @@ static int __pyx_pymod_exec_doc2vec_inner(PyObject *__pyx_pyinit_module) Py_INCREF(__pyx_m); #else #if PY_MAJOR_VERSION < 3 - __pyx_m = Py_InitModule4("doc2vec_inner", __pyx_methods, 0, 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m); + __pyx_m = Py_InitModule4("doc2vec_inner", __pyx_methods, __pyx_k_Optimized_cython_functions_for_t, 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m); #else __pyx_m = PyModule_Create(&__pyx_moduledef); #endif @@ -11735,10 +11726,6 @@ static int __pyx_pymod_exec_doc2vec_inner(PyObject *__pyx_pyinit_module) Py_DECREF(__pyx_t_1); __pyx_t_1 = 0; /*--- Function import code ---*/ __pyx_t_2 = __Pyx_ImportModule("gensim.models.word2vec_inner"); if (!__pyx_t_2) __PYX_ERR(0, 1, __pyx_L1_error) - if (__Pyx_ImportFunction(__pyx_t_2, "our_dot_double", (void (**)(void))&__pyx_f_6gensim_6models_14word2vec_inner_our_dot_double, "__pyx_t_6gensim_6models_14word2vec_inner_REAL_t (int const *, float const *, int const *, float const *, int const *)") < 0) __PYX_ERR(0, 1, __pyx_L1_error) - if (__Pyx_ImportFunction(__pyx_t_2, "our_dot_float", (void (**)(void))&__pyx_f_6gensim_6models_14word2vec_inner_our_dot_float, 
"__pyx_t_6gensim_6models_14word2vec_inner_REAL_t (int const *, float const *, int const *, float const *, int const *)") < 0) __PYX_ERR(0, 1, __pyx_L1_error) - if (__Pyx_ImportFunction(__pyx_t_2, "our_dot_noblas", (void (**)(void))&__pyx_f_6gensim_6models_14word2vec_inner_our_dot_noblas, "__pyx_t_6gensim_6models_14word2vec_inner_REAL_t (int const *, float const *, int const *, float const *, int const *)") < 0) __PYX_ERR(0, 1, __pyx_L1_error) - if (__Pyx_ImportFunction(__pyx_t_2, "our_saxpy_noblas", (void (**)(void))&__pyx_f_6gensim_6models_14word2vec_inner_our_saxpy_noblas, "void (int const *, float const *, float const *, int const *, float *, int const *)") < 0) __PYX_ERR(0, 1, __pyx_L1_error) if (__Pyx_ImportFunction(__pyx_t_2, "bisect_left", (void (**)(void))&__pyx_f_6gensim_6models_14word2vec_inner_bisect_left, "unsigned PY_LONG_LONG (__pyx_t_5numpy_uint32_t *, unsigned PY_LONG_LONG, unsigned PY_LONG_LONG, unsigned PY_LONG_LONG)") < 0) __PYX_ERR(0, 1, __pyx_L1_error) if (__Pyx_ImportFunction(__pyx_t_2, "random_int32", (void (**)(void))&__pyx_f_6gensim_6models_14word2vec_inner_random_int32, "unsigned PY_LONG_LONG (unsigned PY_LONG_LONG *)") < 0) __PYX_ERR(0, 1, __pyx_L1_error) Py_DECREF(__pyx_t_2); __pyx_t_2 = 0; @@ -11747,26 +11734,26 @@ static int __pyx_pymod_exec_doc2vec_inner(PyObject *__pyx_pyinit_module) if (__Pyx_patch_abc() < 0) __PYX_ERR(0, 1, __pyx_L1_error) #endif - /* "gensim/models/doc2vec_inner.pyx":11 - * + /* "gensim/models/doc2vec_inner.pyx":12 + * """Optimized cython functions for training :class:`~gensim.models.doc2vec.Doc2Vec` model.""" * import cython * import numpy as np # <<<<<<<<<<<<<< * from numpy import zeros, float32 as REAL * cimport numpy as np */ - __pyx_t_3 = __Pyx_Import(__pyx_n_s_numpy, 0, -1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 11, __pyx_L1_error) + __pyx_t_3 = __Pyx_Import(__pyx_n_s_numpy, 0, -1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 12, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_3) < 0) __PYX_ERR(0, 11, __pyx_L1_error) + if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_3) < 0) __PYX_ERR(0, 12, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/models/doc2vec_inner.pyx":12 + /* "gensim/models/doc2vec_inner.pyx":13 * import cython * import numpy as np * from numpy import zeros, float32 as REAL # <<<<<<<<<<<<<< * cimport numpy as np * */ - __pyx_t_3 = PyList_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 12, __pyx_L1_error) + __pyx_t_3 = PyList_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_INCREF(__pyx_n_s_zeros); __Pyx_GIVEREF(__pyx_n_s_zeros); @@ -11774,16 +11761,16 @@ static int __pyx_pymod_exec_doc2vec_inner(PyObject *__pyx_pyinit_module) __Pyx_INCREF(__pyx_n_s_float32); __Pyx_GIVEREF(__pyx_n_s_float32); PyList_SET_ITEM(__pyx_t_3, 1, __pyx_n_s_float32); - __pyx_t_4 = __Pyx_Import(__pyx_n_s_numpy, __pyx_t_3, -1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 12, __pyx_L1_error) + __pyx_t_4 = __Pyx_Import(__pyx_n_s_numpy, __pyx_t_3, -1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_ImportFrom(__pyx_t_4, __pyx_n_s_zeros); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 12, __pyx_L1_error) + __pyx_t_3 = __Pyx_ImportFrom(__pyx_t_4, __pyx_n_s_zeros); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_zeros, __pyx_t_3) < 0) __PYX_ERR(0, 12, __pyx_L1_error) + if 
(PyDict_SetItem(__pyx_d, __pyx_n_s_zeros, __pyx_t_3) < 0) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_ImportFrom(__pyx_t_4, __pyx_n_s_float32); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 12, __pyx_L1_error) + __pyx_t_3 = __Pyx_ImportFrom(__pyx_t_4, __pyx_n_s_float32); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_REAL, __pyx_t_3) < 0) __PYX_ERR(0, 12, __pyx_L1_error) + if (PyDict_SetItem(__pyx_d, __pyx_n_s_REAL, __pyx_t_3) < 0) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; @@ -11860,7 +11847,7 @@ static int __pyx_pymod_exec_doc2vec_inner(PyObject *__pyx_pyinit_module) * # in scipy > 0.15, fblas function has been removed * import scipy.linalg.blas as fblas # <<<<<<<<<<<<<< * - * from word2vec_inner cimport bisect_left, random_int32, \ + * from word2vec_inner cimport bisect_left, random_int32, sscal, REAL_t, EXP_TABLE, our_dot, our_saxpy */ __pyx_t_10 = PyList_New(1); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 23, __pyx_L4_except_error) __Pyx_GOTREF(__pyx_t_10); @@ -11900,28 +11887,7 @@ static int __pyx_pymod_exec_doc2vec_inner(PyObject *__pyx_pyinit_module) __pyx_L7_try_end:; } - /* "gensim/models/doc2vec_inner.pyx":31 - * our_dot_double, our_dot_float, our_dot_noblas, our_saxpy_noblas - * - * from word2vec import FAST_VERSION # <<<<<<<<<<<<<< - * - * DEF MAX_DOCUMENT_LEN = 10000 - */ - __pyx_t_9 = PyList_New(1); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 31, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_9); - __Pyx_INCREF(__pyx_n_s_FAST_VERSION); - __Pyx_GIVEREF(__pyx_n_s_FAST_VERSION); - PyList_SET_ITEM(__pyx_t_9, 0, __pyx_n_s_FAST_VERSION); - __pyx_t_4 = __Pyx_Import(__pyx_n_s_word2vec, __pyx_t_9, -1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 31, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); - __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; - __pyx_t_9 = __Pyx_ImportFrom(__pyx_t_4, __pyx_n_s_FAST_VERSION); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 31, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_9); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_FAST_VERSION, __pyx_t_9) < 0) __PYX_ERR(0, 31, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - - /* "gensim/models/doc2vec_inner.pyx":35 + /* "gensim/models/doc2vec_inner.pyx":29 * DEF MAX_DOCUMENT_LEN = 10000 * * cdef int ONE = 1 # <<<<<<<<<<<<<< @@ -11930,7 +11896,7 @@ static int __pyx_pymod_exec_doc2vec_inner(PyObject *__pyx_pyinit_module) */ __pyx_v_6gensim_6models_13doc2vec_inner_ONE = 1; - /* "gensim/models/doc2vec_inner.pyx":36 + /* "gensim/models/doc2vec_inner.pyx":30 * * cdef int ONE = 1 * cdef REAL_t ONEF = 1.0 # <<<<<<<<<<<<<< @@ -11939,53 +11905,53 @@ static int __pyx_pymod_exec_doc2vec_inner(PyObject *__pyx_pyinit_module) */ __pyx_v_6gensim_6models_13doc2vec_inner_ONEF = ((__pyx_t_6gensim_6models_14word2vec_inner_REAL_t)1.0); - /* "gensim/models/doc2vec_inner.pyx":227 + /* "gensim/models/doc2vec_inner.pyx":221 * * * def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, # <<<<<<<<<<<<<< * train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): */ - __pyx_t_4 = PyCFunction_NewEx(&__pyx_mdef_6gensim_6models_13doc2vec_inner_1train_document_dbow, NULL, __pyx_n_s_gensim_models_doc2vec_inner); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 227, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); - if (PyDict_SetItem(__pyx_d, 
__pyx_n_s_train_document_dbow, __pyx_t_4) < 0) __PYX_ERR(0, 227, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_9 = PyCFunction_NewEx(&__pyx_mdef_6gensim_6models_13doc2vec_inner_1train_document_dbow, NULL, __pyx_n_s_gensim_models_doc2vec_inner); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 221, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_9); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_train_document_dbow, __pyx_t_9) < 0) __PYX_ERR(0, 221, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; - /* "gensim/models/doc2vec_inner.pyx":363 + /* "gensim/models/doc2vec_inner.pyx":401 * * * def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, # <<<<<<<<<<<<<< * learn_doctags=True, learn_words=True, learn_hidden=True, * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): */ - __pyx_t_4 = PyCFunction_NewEx(&__pyx_mdef_6gensim_6models_13doc2vec_inner_3train_document_dm, NULL, __pyx_n_s_gensim_models_doc2vec_inner); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 363, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_train_document_dm, __pyx_t_4) < 0) __PYX_ERR(0, 363, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_9 = PyCFunction_NewEx(&__pyx_mdef_6gensim_6models_13doc2vec_inner_3train_document_dm, NULL, __pyx_n_s_gensim_models_doc2vec_inner); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 401, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_9); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_train_document_dm, __pyx_t_9) < 0) __PYX_ERR(0, 401, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; - /* "gensim/models/doc2vec_inner.pyx":521 + /* "gensim/models/doc2vec_inner.pyx":604 * * * def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, # <<<<<<<<<<<<<< * learn_doctags=True, learn_words=True, learn_hidden=True, * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): */ - __pyx_t_4 = PyCFunction_NewEx(&__pyx_mdef_6gensim_6models_13doc2vec_inner_5train_document_dm_concat, NULL, __pyx_n_s_gensim_models_doc2vec_inner); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 521, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_train_document_dm_concat, __pyx_t_4) < 0) __PYX_ERR(0, 521, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_9 = PyCFunction_NewEx(&__pyx_mdef_6gensim_6models_13doc2vec_inner_5train_document_dm_concat, NULL, __pyx_n_s_gensim_models_doc2vec_inner); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 604, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_9); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_train_document_dm_concat, __pyx_t_9) < 0) __PYX_ERR(0, 604, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; /* "gensim/models/doc2vec_inner.pyx":1 * #!/usr/bin/env cython # <<<<<<<<<<<<<< * # cython: boundscheck=False * # cython: wraparound=False */ - __pyx_t_4 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 1, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_4) < 0) __PYX_ERR(0, 1, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_9 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_9); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_9) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; - /* "../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1021 + /* 
"../../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1021 * raise ImportError("numpy.core.umath failed to import") * * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<< diff --git a/gensim/models/doc2vec_inner.pyx b/gensim/models/doc2vec_inner.pyx index 3facbdde44..f4ed078713 100644 --- a/gensim/models/doc2vec_inner.pyx +++ b/gensim/models/doc2vec_inner.pyx @@ -2,17 +2,17 @@ # cython: boundscheck=False # cython: wraparound=False # cython: cdivision=True +# cython: embedsignature=True # coding: utf-8 # # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - +"""Optimized cython functions for training :class:`~gensim.models.doc2vec.Doc2Vec` model.""" import cython import numpy as np from numpy import zeros, float32 as REAL cimport numpy as np -from libc.math cimport exp from libc.string cimport memset, memcpy # scipy <= 0.15 @@ -22,13 +22,7 @@ except ImportError: # in scipy > 0.15, fblas function has been removed import scipy.linalg.blas as fblas -from word2vec_inner cimport bisect_left, random_int32, \ - scopy, saxpy, sdot, dsdot, snrm2, sscal, \ - REAL_t, EXP_TABLE, \ - our_dot, our_saxpy, \ - our_dot_double, our_dot_float, our_dot_noblas, our_saxpy_noblas - -from word2vec import FAST_VERSION +from word2vec_inner cimport bisect_left, random_int32, sscal, REAL_t, EXP_TABLE, our_dot, our_saxpy DEF MAX_DOCUMENT_LEN = 10000 @@ -227,10 +221,10 @@ cdef unsigned long long fast_document_dmc_neg( def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): - ""Update distributed bag of words model ("PV-DBOW") by training on a single document. + """Update distributed bag of words model ("PV-DBOW") by training on a single document. Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and - :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector()`. + :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector`. Parameters ---------- @@ -246,8 +240,8 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, work : list of float, optional Updates to be performed on each neuron in the hidden layer of the underlying network. train_words : bool, optional - Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both** - `learn_words` and `train_words` are set to True. + Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both** `learn_words` + and `train_words` are set to True. learn_doctags : bool, optional Whether the tag vectors should be updated. learn_words : bool, optional @@ -255,21 +249,20 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, `learn_words` and `train_words` are set to True. learn_hidden : bool, optional Whether or not the weights of the hidden layer will be updated. - word_vectors : list of list of float, optional - The vector representation for each word in the vocabulary. If None, these will be retrieved from - the model. - word_locks : list of float, optional - A learning lock factor for each weight in the hidden layer. A value of 0 completely - blocks updates, a value of 1 allows full speed learning. - doctag_vectors : list of list of float, optional + word_vectors : numpy.ndarray, optional + The vector representation for each word in the vocabulary. If None, these will be retrieved from the model. 
+ word_locks : numpy.ndarray, optional + A learning lock factor for each weight in the hidden layer for words, a value of 0 completely blocks updates, + a value of 1 allows full updates to word-vectors. + doctag_vectors : numpy.ndarray, optional Vector representations of the tags. If None, these will be retrieved from the model. - doctag_locks : list of float, optional - The lock factors for each tag. + doctag_locks : numpy.ndarray, optional + The lock factors for each tag, same as `word_locks`, but for document-vectors. Returns ------- int - Number of words in the input document. + Number of words in the input document that were actually used for training. """ cdef int hs = model.hs @@ -409,11 +402,11 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N learn_doctags=True, learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): """Update distributed memory model ("PV-DM") by training on a single document. + This method implements the DM model with a projection (input) layer that is either the sum or mean of the context + vectors, depending on the model's `dm_mean` configuration field. Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and - :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector()`. This method implements - the DM model with a projection (input) layer that is either the sum or mean of - the context vectors, depending on the model's `dm_mean` configuration field. + :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector`. Parameters ---------- @@ -437,20 +430,20 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N `learn_words` and `train_words` are set to True. learn_hidden : bool, optional Whether or not the weights of the hidden layer will be updated. - word_vectors : iterable of list of float - Vector representations of each word in the model's vocabulary. - word_locks : listf of float, optional - Lock factors for each word in the vocabulary. 0 blocks training, 1 fully allows it. - doctag_vectors : list of list of float, optional + word_vectors : numpy.ndarray, optional + The vector representation for each word in the vocabulary. If None, these will be retrieved from the model. + word_locks : numpy.ndarray, optional + A learning lock factor for each weight in the hidden layer for words, a value of 0 completely blocks updates, + a value of 1 allows full updates to word-vectors. + doctag_vectors : numpy.ndarray, optional Vector representations of the tags. If None, these will be retrieved from the model. - doctag_locks : list of float, optional - The lock factors for each tag. 0 blocks training, 1 fully allows it. + doctag_locks : numpy.ndarray, optional + The lock factors for each tag, same as `word_locks`, but for document-vectors. Returns ------- int - Number of words in the input document that were actually used for training (they were found in the - vocavulary and they were not discarded by negative sampling). + Number of words in the input document that were actually used for training. 
""" cdef int hs = model.hs @@ -611,11 +604,12 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, learn_doctags=True, learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): - """Update distributed memory model ("PV-DM") by training on a single document, using a - concatenation of the context window word vectors (rather than a sum or average). This - might be slower since the input at each batch will be significantly larger. + """Update distributed memory model ("PV-DM") by training on a single document, using a concatenation of the context + window word vectors (rather than a sum or average). + This might be slower since the input at each batch will be significantly larger. - Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. + Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and + :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector`. Parameters ---------- @@ -639,20 +633,20 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, `learn_words` and `train_words` are set to True. learn_hidden : bool, optional Whether or not the weights of the hidden layer will be updated. - word_vectors : iterable of list of float, optional - Vector representations of each word in the model's vocabulary. - word_locks : listf of float, optional - Lock factors for each word in the vocabulary. - doctag_vectors : list of list of float, optional + word_vectors : numpy.ndarray, optional + The vector representation for each word in the vocabulary. If None, these will be retrieved from the model. + word_locks : numpy.ndarray, optional + A learning lock factor for each weight in the hidden layer for words, value 0 completely blocks updates, + a value of 1 allows to update word-vectors. + doctag_vectors : numpy.ndarray, optional Vector representations of the tags. If None, these will be retrieved from the model. - doctag_locks : list of float, optional - The lock factors for each tag. + doctag_locks : numpy.ndarray, optional + The lock factors for each tag, same as `word_locks`, but for document-vectors. Returns ------- int - Number of words in the input document that were actually used for training (they were found in the - vocavulary and they were not discarded by negative sampling). + Number of words in the input document that were actually used for training. """ cdef int hs = model.hs From 7cbbac9eb11bee68cde14da5cbe22cd1a2fcedc7 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 18 Apr 2018 19:01:18 +0500 Subject: [PATCH 40/41] fix fasttext[1] --- gensim/models/fasttext.py | 59 ++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 4a2208dc83..397360c2a4 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -9,36 +9,37 @@ hierarchical softmax or negative sampling `Enriching Word Vectors with Subword Information `_. -Notes ------ -There are more ways to get word vectors in Gensim than just FastText. -See wrappers for VarEmbed and WordRank or Word2Vec -This module allows training a word embedding from a training corpus with the additional ability -to obtain word vectors for out-of-vocabulary words. +This module allows training a word embedding from a training corpus with the additional ability to obtain word vectors +for out-of-vocabulary words. 
For a tutorial on gensim's native fasttext, refer to the `notebook `_.

+Notes
+-----
 **Make sure you have a C compiler before installing gensim, to use optimized (compiled) fasttext training**

 Examples
 --------
-#. Initialize a model with e.g. ::
+* Initialize a model with e.g. ::

     >>> from gensim.test.utils import common_texts
+    >>> from gensim.models import FastText
     >>>
     >>> model = FastText(size=4, window=3, min_count=1)
     >>> model.build_vocab(common_texts)

-#. Persist a model to disk with ::
+* Persist a model to disk with ::

     >>> model.save("temp_model.w2v")
     >>> model = FastText.load("temp_model.w2v")  # you can continue training with the loaded model!

-    The word vectors are stored in a KeyedVectors instance in `model.wv`.
-    This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec::
-
-    >>> computer_vec = model.wv['computer']  # numpy vector of a word
+* Retrieve word-vector for vocab and out-of-vocab word (this is the main feature of the current model)::
+
+    >>> existent_word = "computer"
+    >>> computer_vec = model.wv[existent_word]  # numpy vector of a word
+    >>>
+    >>> oov_word = "graph-out-of-vocab"
+    >>> oov_vec = model.wv[oov_word]  # numpy vector for OOV word

 #. You can perform various NLP word tasks with the model. Some of them are already built-in ::

     >>> similarities = model.wv.most_similar(positive=['computer', 'human'], negative=['interface'])
     >>> most_similar = similarities[0]

     >>> similarities = model.wv.most_similar_cosmul(positive=['computer', 'human'], negative=['interface'])
     >>> most_similar = similarities[0]

     >>> not_matching = model.wv.doesnt_match("human computer interface tree".split())

     >>> sim_score = model.wv.similarity('computer', 'human')

 #. Correlation with human opinion on word similarity ::

     >>> from gensim.test.utils import datapath
     >>>
     >>> similarities = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))

-#. And on analogies::
+#. And on analogies ::

-    >>> analogies = model.wv.accuracy(datapath('questions-words.txt'))
+    >>> analogies_result = model.wv.accuracy(datapath('questions-words.txt'))

 """
 import logging

 def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
     """Update CBOW model by training on a sequence of sentences.
-    Each sentence is a list of string tokens, which are looked up in the model's
-    vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`.
+
+    Called internally from :meth:`~gensim.models.fasttext.FastText.train`.

     Notes
     -----
-    This is the non-optimized, Python version. If you have cython installed, gensim
-    will use the optimized version from fasttext_inner instead.
+    This is the non-optimized, Python version. If you have cython installed, gensim will use the optimized version
+    from :mod:`gensim.models.fasttext_inner` instead.

     Parameters
     ----------
     model : :class:`~gensim.models.fasttext.FastText`
-        `FastText` instance.
+        Model instance.
     sentences : iterable of list of str
-        Iterable of the sentences directly from disk/network.
+        Iterable of the sentences.
     alpha : float
         Learning rate.
     work : :class:`numpy.ndarray`, optional
-        Private working memory for each worker.
+        UNUSED.
     neu1 : :class:`numpy.ndarray`, optional
-        Private working memory for each worker.
+        UNUSED.

     Returns
     -------
     int

 def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
     """Update skip-gram model by training on a sequence of sentences.
-    Each sentence is a list of string tokens, which are looked up in the model's
-    vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`.
+
+    Called internally from :meth:`~gensim.models.fasttext.FastText.train`.

     Notes
     -----
-    This is the non-optimized, Python version.
If you have cython installed, gensim will use the optimized version + from :mod:`gensim.models.fasttext_inner` instead. Parameters ---------- @@ -165,9 +166,9 @@ def train_batch_sg(model, sentences, alpha, work=None, neu1=None): alpha : float Learning rate. work : :class:`numpy.ndarray`, optional - Private working memory for each worker. + UNUSED. neu1 : :class:`numpy.ndarray`, optional - Private working memory for each worker. + UNUSED. Returns ------- @@ -878,6 +879,7 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_inse class FastTextVocab(Word2VecVocab): + """Vocabulary used by :class:`~gensim.models.fasttext.FastText`.""" def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0): super(FastTextVocab, self).__init__( max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, @@ -892,6 +894,7 @@ def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, tr class FastTextTrainables(Word2VecTrainables): + """Represents the inner shallow neural network used to train :class:`~gensim.models.fasttext.FastText`.""" def __init__(self, vector_size=100, seed=1, hashfxn=hash, bucket=2000000): super(FastTextTrainables, self).__init__( vector_size=vector_size, seed=seed, hashfxn=hashfxn) From 0e9e6c5f3dadca52adb0b247dc56b71a411fa24b Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 18 Apr 2018 19:14:48 +0500 Subject: [PATCH 41/41] reformat example sections --- gensim/models/doc2vec.py | 28 +++++++-------- gensim/models/fasttext.py | 72 +++++++++++++++++++++------------------ 2 files changed, 53 insertions(+), 47 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 56833e9ad7..c4eb6c6a4a 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -24,27 +24,27 @@ Examples -------- -* Initialize a model with e.g. :: +Initialize & train a model - >>> from gensim.test.utils import common_texts, get_tmpfile - >>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument - >>> - >>> documents = [TaggedDocument(word, [i]) for i, word in enumerate(common_texts)] - >>> model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4) +>>> from gensim.test.utils import common_texts, get_tmpfile +>>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument +>>> +>>> documents = [TaggedDocument(word, [i]) for i, word in enumerate(common_texts)] +>>> model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4) -* Persist a model to disk with :: +Persist a model to disk - >>> tmp_f = get_tmpfile("model") - >>> model.save(tmp_f) - >>> model = Doc2Vec.load(tmp_f) # you can continue training with the loaded model! +>>> tmp_f = get_tmpfile("model") +>>> model.save(tmp_f) +>>> model = Doc2Vec.load(tmp_f) # you can continue training with the loaded model! 
-* If you're finished training a model (=no more updates, only querying, reduce memory usage), you can do ::
+If you're finished training a model (=no more updates, only querying, reduce memory usage), you can do

 >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

-* Infer vector for new document ::
+Infer vector for new document

 >>> vector = model.infer_vector(["system", "response"])

 """
 import logging

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 397360c2a4..1bc0611479 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -23,45 +23,51 @@

 Examples
 --------
-* Initialize a model with e.g. ::

-    >>> from gensim.test.utils import common_texts
-    >>> from gensim.models import FastText
-    >>>
-    >>> model = FastText(size=4, window=3, min_count=1)
-    >>> model.build_vocab(common_texts)
+Initialize and train a model

+>>> from gensim.test.utils import common_texts, get_tmpfile
+>>> from gensim.models import FastText
+>>>
+>>> model = FastText(size=4, window=3, min_count=1)
+>>> model.build_vocab(common_texts)
+>>> model.train(common_texts, epochs=1, total_examples=model.corpus_count)

-* Persist a model to disk with ::
-    >>> model.save("temp_model.w2v")
-    >>> model = FastText.load("temp_model.w2v")  # you can continue training with the loaded model!
+Persist a model to disk

+>>> tmp_fname = get_tmpfile("temp_fasttext.model")
+>>>
+>>> model.save(tmp_fname)
+>>> model = FastText.load(tmp_fname)  # you can continue training with the loaded model!

-* Retrieve word-vector for vocab and out-of-vocab word (this is main feature of current model)::
-    >>> existent_word = "computer"
-    >>> computer_vec = model.wv[existent_word]  # numpy vector of a word
-    >>>
-    >>> oov_word = "graph-out-of-vocab"
-    >>> oov_vec = model.wv[oov_word]  # numpy vector for OOV word
+Retrieve word-vector for vocab and out-of-vocab word (this is the main feature of the current model)

+>>> existent_word = "computer"
+>>> computer_vec = model.wv[existent_word]  # numpy vector of a word
+>>>
+>>> oov_word = "graph-out-of-vocab"
+>>> oov_vec = model.wv[oov_word]  # numpy vector for OOV word

-#. You can perform various NLP word tasks with the model. Some of them are already built-in ::
+You can perform various NLP word tasks with the model; some of them are already built-in

+>>> similarities = model.wv.most_similar(positive=['computer', 'human'], negative=['interface'])
+>>> most_similar = similarities[0]
+>>>
+>>> similarities = model.wv.most_similar_cosmul(positive=['computer', 'human'], negative=['interface'])
+>>> most_similar = similarities[0]
+>>>
+>>> not_matching = model.wv.doesnt_match("human computer interface tree".split())
+>>>
+>>> sim_score = model.wv.similarity('computer', 'human')

-#. Correlation with human opinion on word similarity ::
+Correlation with human opinion on word similarity

 >>> from gensim.test.utils import datapath
 >>>
 >>> similarities = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))

 And on analogies

 >>> analogies_result = model.wv.accuracy(datapath('questions-words.txt'))

 """
 import logging
@@ -202,9 +208,9 @@ class FastText(BaseWordEmbeddingsModel):
     """Class for training, using and evaluating word representations learned using method
     described in `Enriching Word Vectors with Subword Information `_, aka FastText.

-    The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save()` and
-    :meth:`~gensim.models.fasttext.FastText.load()` methods, or loaded in a format compatible with the original
-    fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format()`.
+    The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save` and
+    :meth:`~gensim.models.fasttext.FastText.load` methods, or loaded in a format compatible with the original
+    fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format`.

     Some important attributes are the following:

@@ -214,7 +220,7 @@ class FastText(BaseWordEmbeddingsModel):
         compute embeddings even for **unseen** words (that do not exist in the vocabulary), as the aggregate of the
         n-grams included in the word. After training the model, this attribute can be used directly to query those
         embeddings in various ways. Check the module level docstring from some examples.
     self.vocabulary : :class:`~gensim.models.fasttext.FastTextVocab`
         This object represents the vocabulary (sometimes called Dictionary in gensim) of the model. Besides keeping
         track of all unique words, this object provides extra functionality, such as constructing a huffman tree
         (frequent words are closer to the root), or discarding extremely rare words.
@@ -225,7 +231,7 @@ class FastText(BaseWordEmbeddingsModel):
         You can think of it as a NN with a single projection and hidden layer which we train on the corpus. The weights
         are then used as our embeddings. An important difference however between the two models, is the scoring
         function used to compute the loss. In the case of FastText, this is modified to also account
-        for the internal structure of words, besides their cooccurence counts.
+        for the internal structure of words, besides their co-occurrence counts.

     """

     def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
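
The `word_locks` and `doctag_locks` parameters documented throughout the doc2vec_inner.pyx docstrings above
boil down to a single multiply inside the update loop. The following sketch shows only that semantic in plain
numpy; it is an illustration of the parameter's meaning, not gensim's optimized cython code, and the name
`locked_update` and its arguments are hypothetical::

    import numpy as np

    def locked_update(vectors, index, gradient, locks):
        # A lock factor of 0.0 freezes the vector entirely, 1.0 applies the
        # full gradient, and intermediate values scale learning accordingly.
        vectors[index] += gradient * locks[index]

    doctag_vectors = np.zeros((2, 4), dtype=np.float32)
    doctag_locks = np.array([1.0, 0.0], dtype=np.float32)  # second tag is frozen
    grad = np.full(4, 0.5, dtype=np.float32)
    locked_update(doctag_vectors, 0, grad, doctag_locks)  # vector 0 moves
    locked_update(doctag_vectors, 1, grad, doctag_locks)  # vector 1 stays at zero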
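
Likewise, the "aggregate of the n-grams included in the word" behaviour that the FastText docstrings describe
can be sketched without touching gensim internals. This is a toy model of the idea, not the library's
implementation: real FastText hashes each n-gram into one of `bucket` slots rather than keeping a dict, and the
`ngram_vectors` mapping below is hypothetical::

    import numpy as np

    def char_ngrams(word, min_n=3, max_n=6):
        # FastText wraps each word in angle-bracket boundary markers before
        # extracting its character n-grams.
        wrapped = "<%s>" % word
        return [wrapped[i:i + n]
                for n in range(min_n, max_n + 1)
                for i in range(len(wrapped) - n + 1)]

    def oov_vector(word, ngram_vectors, vector_size):
        # Average the vectors of whichever n-grams are known; an unseen word
        # still gets an embedding assembled from its subword pieces.
        known = [ngram_vectors[ng] for ng in char_ngrams(word) if ng in ngram_vectors]
        if not known:
            return np.zeros(vector_size, dtype=np.float32)
        return np.mean(known, axis=0)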