From 3f30e1e711c108b9ef2e2ae336ab4838280eb256 Mon Sep 17 00:00:00 2001 From: jodevak Date: Mon, 25 Sep 2017 17:47:04 +0300 Subject: [PATCH 01/20] fix build vocab speed issue, and new function to build vocab from previously provided word frequencies table --- gensim/models/word2vec.py | 391 ++++++++++++++++++-------------------- 1 file changed, 180 insertions(+), 211 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index ce7de6330c..fea1cdc990 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -114,16 +114,18 @@ except ImportError: from Queue import Queue, Empty -from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL,\ - uint32, seterr, array, uint8, vstack, fromstring, sqrt,\ - empty, sum as np_sum, ones, logaddexp +from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL, \ + double, uint32, seterr, array, uint8, vstack, fromstring, sqrt, newaxis, \ + ndarray, empty, sum as np_sum, prod, ones, ascontiguousarray, vstack, logaddexp from scipy.special import expit from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc +from gensim.corpora.dictionary import Dictionary from six import iteritems, itervalues, string_types from six.moves import xrange from types import GeneratorType +from scipy import stats logger = logging.getLogger(__name__) @@ -136,6 +138,7 @@ FAST_VERSION = -1 MAX_WORDS_IN_BATCH = 10000 + def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): """ Update skip-gram model by training on a sequence of sentences. @@ -150,7 +153,7 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): result = 0 for sentence in sentences: word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2**32] + model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code @@ -159,13 +162,13 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): # don't train on the `word` itself if pos2 != pos: - train_sg_pair( - model, model.wv.index2word[word.index], word2.index, alpha, compute_loss=compute_loss - ) + train_sg_pair(model, model.wv.index2word[word.index], word2.index, alpha, + compute_loss=compute_loss) result += len(word_vocabs) return result + def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False): """ Update CBOW model by training on a sequence of sentences. @@ -180,7 +183,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss result = 0 for sentence in sentences: word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2**32] + model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code start = max(0, pos - model.window + reduced_window) @@ -193,6 +196,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss result += len(word_vocabs) return result + def score_sentence_sg(model, sentence, work=None): """ Obtain likelihood score for a single sentence in a fitted skip-gram representaion. 
@@ -222,6 +226,7 @@ def score_sentence_sg(model, sentence, work=None): return log_prob_sentence + def score_sentence_cbow(model, sentence, work=None, neu1=None): """ Obtain likelihood score for a single sentence in a fitted CBOW representaion. @@ -254,32 +259,18 @@ def score_sentence_cbow(model, sentence, work=None, neu1=None): def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_hidden=True, - context_vectors=None, context_locks=None, compute_loss=False, is_ft=False): + context_vectors=None, context_locks=None, compute_loss=False): if context_vectors is None: - if is_ft: - context_vectors_vocab = model.wv.syn0_vocab - context_vectors_ngrams = model.wv.syn0_ngrams - else: - context_vectors = model.wv.syn0 + context_vectors = model.wv.syn0 if context_locks is None: - if is_ft: - context_locks_vocab = model.syn0_vocab_lockf - context_locks_ngrams = model.syn0_ngrams_lockf - else: - context_locks = model.syn0_lockf + context_locks = model.syn0_lockf if word not in model.wv.vocab: return predict_word = model.wv.vocab[word] # target word (NN output) - if is_ft: - l1_vocab = context_vectors_vocab[context_index[0]] - l1_ngrams = np_sum(context_vectors_ngrams[context_index[1:]], axis=0) - if context_index: - l1 = np_sum([l1_vocab, l1_ngrams], axis=0) / len(context_index) - else: - l1 = context_vectors[context_index] # input word (NN input/projection layer) - lock_factor = context_locks[context_index] + l1 = context_vectors[context_index] # input word (NN input/projection layer) + lock_factor = context_locks[context_index] neu1e = zeros(l1.shape) @@ -295,7 +286,7 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h # loss component corresponding to hierarchical softmax if compute_loss: - sgn = (-1.0)**predict_word.code # `ch` function, 0 -> 1, 1 -> -1 + sgn = (-1.0) ** predict_word.code # `ch` function, 0 -> 1, 1 -> -1 lprob = -log(expit(-sgn * prod_term)) model.running_training_loss += sum(lprob) @@ -320,30 +311,12 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h model.running_training_loss -= log(expit(prod_term[0])) # for the output word if learn_vectors: - if is_ft: - model.wv.syn0_vocab[context_index[0]] += neu1e * context_locks_vocab[context_index[0]] - for i in context_index[1:]: - model.wv.syn0_ngrams[i] += neu1e * context_locks_ngrams[i] - else: - l1 += neu1e * lock_factor # learn input -> hidden (mutates model.wv.syn0[word2.index], if that is l1) + l1 += neu1e * lock_factor # learn input -> hidden (mutates model.wv.syn0[word2.index], if that is l1) return neu1e -def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, compute_loss=False, - context_vectors=None, context_locks=None, is_ft=False): - if context_vectors is None: - if is_ft: - context_vectors_vocab = model.wv.syn0_vocab - context_vectors_ngrams = model.wv.syn0_ngrams - else: - context_vectors = model.wv.syn0 - if context_locks is None: - if is_ft: - context_locks_vocab = model.syn0_vocab_lockf - context_locks_ngrams = model.syn0_ngrams_lockf - else: - context_locks = model.syn0_lockf - +def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, + compute_loss=False): neu1e = zeros(l1.shape) if model.hs: @@ -357,7 +330,7 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr # loss component corresponding to hierarchical softmax if compute_loss: - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0) ** 
word.code # ch function, 0-> 1, 1 -> -1 model.running_training_loss += sum(-log(expit(-sgn * prod_term))) if model.negative: @@ -382,18 +355,10 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr if learn_vectors: # learn input -> hidden, here for all words in the window separately - if is_ft: - if not model.cbow_mean and input_word_indices: - neu1e /= (len(input_word_indices[0]) + len(input_word_indices[1])) - for i in input_word_indices[0]: - context_vectors_vocab[i] += neu1e * context_locks_vocab[i] - for i in input_word_indices[1]: - context_vectors_ngrams[i] += neu1e * context_locks_ngrams[i] - else: - if not model.cbow_mean and input_word_indices: - neu1e /= len(input_word_indices) - for i in input_word_indices: - context_vectors[i] += neu1e * context_locks[i] + if not model.cbow_mean and input_word_indices: + neu1e /= len(input_word_indices) + for i in input_word_indices: + model.wv.syn0[i] += neu1e * model.syn0_lockf[i] return neu1e @@ -401,14 +366,14 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr def score_sg_pair(model, word, word2): l1 = model.wv.syn0[word2.index] l2a = deepcopy(model.syn1[word.point]) # 2d matrix, codelen x layer1_size - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) return sum(lprob) def score_cbow_pair(model, word, l1): l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) return sum(lprob) @@ -425,10 +390,11 @@ class Word2Vec(utils.SaveLoad): """ - def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False): + def __init__( + self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, + max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, + sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False): """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. @@ -502,9 +468,9 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, self.load = call_on_class_only if FAST_VERSION == -1: - logger.warning('Slow version of %s is being used', __name__) + logger.warning('Slow version of {0} is being used'.format(__name__)) else: - logger.debug('Fast version of %s is being used', __name__) + logger.debug('Fast version of {0} is being used'.format(__name__)) self.initialize_word_vectors() self.sg = int(sg) @@ -540,19 +506,18 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, if isinstance(sentences, GeneratorType): raise TypeError("You can't pass a generator as the sentences argument. 
Try an iterator.") self.build_vocab(sentences, trim_rule=trim_rule) - self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, start_alpha=self.alpha, end_alpha=self.min_alpha) + self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, + start_alpha=self.alpha, end_alpha=self.min_alpha) else: if trim_rule is not None: logger.warning( - "The rule, if given, is only used to prune vocabulary during build_vocab() " - "and is not stored as part of the model. Model initialized without sentences. " - "trim_rule provided, if any, will be ignored." - ) + "The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. ") + logger.warning("Model initialized without sentences. trim_rule provided, if any, will be ignored.") def initialize_word_vectors(self): self.wv = KeyedVectors() - def make_cum_table(self, power=0.75, domain=2**31 - 1): + def make_cum_table(self, power=0.75, domain=2 ** 31 - 1): """ Create a cumulative-distribution table using stored vocabulary word counts for drawing random words in the negative-sampling training routines. @@ -569,10 +534,10 @@ def make_cum_table(self, power=0.75, domain=2**31 - 1): # compute sum of all power (Z in paper) train_words_pow = 0.0 for word_index in xrange(vocab_size): - train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count**power + train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count ** power cumulative = 0.0 for word_index in xrange(vocab_size): - cumulative += self.wv.vocab[self.wv.index2word[word_index]].count**power + cumulative += self.wv.vocab[self.wv.index2word[word_index]].count ** power self.cum_table[word_index] = round(cumulative / train_words_pow * domain) if len(self.cum_table) > 0: assert self.cum_table[-1] == domain @@ -590,9 +555,8 @@ def create_binary_tree(self): heapq.heapify(heap) for i in xrange(len(self.wv.vocab) - 1): min1, min2 = heapq.heappop(heap), heapq.heappop(heap) - heapq.heappush( - heap, Vocab(count=min1.count + min2.count, index=i + len(self.wv.vocab), left=min1, right=min2) - ) + heapq.heappush(heap, + Vocab(count=min1.count + min2.count, index=i + len(self.wv.vocab), left=min1, right=min2)) # recurse over the tree, assigning a binary code to each vocabulary word if heap: @@ -612,16 +576,31 @@ def create_binary_tree(self): logger.info("built huffman tree with maximum node depth %i", max_depth) def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False): + """ Build vocabulary from a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of unicode strings. 
- """ self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling self.finalize_vocab(update=update) # build tables & arrays + def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): + + """ + Build vocabulary from a dictionary of word frequencies + """ + logger.info("Processing provided word frequencies") + vocab = defaultdict(int, word_freq) + + self.corpus_count = corpus_count if corpus_count else 0 + self.raw_vocab = vocab + + self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling + self.finalize_vocab(update=update) # build tables & arrays + def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): + """Do an initial scan of all words appearing in sentences.""" logger.info("collecting all words and their counts") sentence_no = -1 @@ -629,37 +608,33 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): min_reduce = 1 vocab = defaultdict(int) checked_string_types = 0 + for sentence_no, sentence in enumerate(sentences): if not checked_string_types: if isinstance(sentence, string_types): logger.warning( - "Each 'sentences' item should be a list of words (usually unicode strings). " - "First item here is instead plain %s.", - type(sentence) + "Each 'sentences' item should be a list of words (usually unicode strings)." + "First item here is instead plain %s.", type(sentence) ) checked_string_types += 1 if sentence_no % progress_per == 0: - logger.info( - "PROGRESS: at sentence #%i, processed %i words, keeping %i word types", - sentence_no, sum(itervalues(vocab)) + total_words, len(vocab) - ) + logger.info("PROGRESS: at sentence #%i, processed %i words, keeping %i word types", + sentence_no, total_words, len(vocab)) + for word in sentence: vocab[word] += 1 + total_words += 1 if self.max_vocab_size and len(vocab) > self.max_vocab_size: - total_words += utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) + utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) min_reduce += 1 - total_words += sum(itervalues(vocab)) - logger.info( - "collected %i word types from a corpus of %i raw words and %i sentences", - len(vocab), total_words, sentence_no + 1 - ) + logger.info("collected %i word types from a corpus of %i raw words and %i sentences", len(vocab), total_words, sentence_no + 1) self.corpus_count = sentence_no + 1 self.raw_vocab = vocab - def scale_vocab(self, min_count=None, sample=None, dry_run=False, - keep_raw_vocab=False, trim_rule=None, update=False): + def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab=False, trim_rule=None, + update=False): """ Apply vocabulary settings for `min_count` (discarding less-frequent words) and `sample` (controlling the downsampling of more-frequent words). 
@@ -700,16 +675,12 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, drop_total += v original_unique_total = len(retain_words) + drop_unique retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) - logger.info( - "min_count=%d retains %i unique words (%i%% of original %i, drops %i)", - min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique - ) + logger.info("min_count=%d retains %i unique words (%i%% of original %i, drops %i)", + min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique) original_total = retain_total + drop_total retain_pct = retain_total * 100 / max(original_total, 1) - logger.info( - "min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", - min_count, retain_total, retain_pct, original_total, drop_total - ) + logger.info("min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", + min_count, retain_total, retain_pct, original_total, drop_total) else: logger.info("Updating model with new vocabulary") new_total = pre_exist_total = 0 @@ -733,12 +704,10 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1) new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1) - logger.info( - "New added %i unique words (%i%% of original %i) " - "and increased the count of %i pre-existing words (%i%% of original %i)", - len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words), - pre_exist_unique_pct, original_unique_total - ) + logger.info("""New added %i unique words (%i%% of original %i) + and increased the count of %i pre-existing words (%i%% of original %i)""", + len(new_words), new_unique_pct, original_unique_total, + len(pre_exist_words), pre_exist_unique_pct, original_unique_total) retain_words = new_words + pre_exist_words retain_total = new_total + pre_exist_total @@ -764,23 +733,22 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, word_probability = 1.0 downsample_total += v if not dry_run: - self.wv.vocab[w].sample_int = int(round(word_probability * 2**32)) + self.wv.vocab[w].sample_int = int(round(word_probability * 2 ** 32)) if not dry_run and not keep_raw_vocab: logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab)) self.raw_vocab = defaultdict(int) logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique) - logger.info( - "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", - downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total - ) + logger.info("downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", + downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total) - # return from each step: words-affected, resulting-corpus-size, extra memory estimates - report_values = { - 'drop_unique': drop_unique, 'retain_total': retain_total, 'downsample_unique': downsample_unique, - 'downsample_total': int(downsample_total), 'memory': self.estimate_memory(vocab_size=len(retain_words)) - } + # return from each step: words-affected, resulting-corpus-size + report_values = {'drop_unique': drop_unique, 'retain_total': retain_total, + 'downsample_unique': downsample_unique, 'downsample_total': int(downsample_total)} + + # print extra memory estimates + report_values['memory'] = self.estimate_memory(vocab_size=len(retain_words)) 
return report_values @@ -846,7 +814,8 @@ def _raw_word_count(self, job): return sum(len(sentence) for sentence in job) def train(self, sentences, total_examples=None, total_words=None, - epochs=None, start_alpha=None, end_alpha=None, word_count=0, + epochs=None, start_alpha=None, end_alpha=None, + word_count=0, queue_factor=2, report_delay=1.0, compute_loss=None): """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). @@ -862,13 +831,11 @@ def train(self, sentences, total_examples=None, total_words=None, explicit `epochs` argument MUST be provided. In the common and recommended case, where `train()` is only called once, the model's cached `iter` value should be supplied as `epochs` value. """ - if self.model_trimmed_post_training: + if (self.model_trimmed_post_training): raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method") if FAST_VERSION < 0: - warnings.warn( - "C extension not loaded for Word2Vec, training will be slow. " - "Install a C compiler and reinstall gensim for fast training." - ) + warnings.warn("C extension not loaded for Word2Vec, training will be slow. " + "Install a C compiler and reinstall gensim for fast training.") self.neg_labels = [] if self.negative > 0: # precompute negative labels optimization for pure-python training @@ -882,8 +849,8 @@ def train(self, sentences, total_examples=None, total_words=None, logger.info( "training model with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s negative=%s window=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative, self.window - ) + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, + self.hs, self.sample, self.negative, self.window) if not self.wv.vocab: raise RuntimeError("you must first build vocabulary before training the model") @@ -894,15 +861,11 @@ def train(self, sentences, total_examples=None, total_words=None, raise ValueError( "The number of sentences in the training corpus is missing. Did you load the model via KeyedVectors.load_word2vec_format?" "Models loaded via load_word2vec_format don't support further training. " - "Instead start with a blank model, scan_vocab on the new corpus, " - "intersect_word2vec_format with the old model, then train." - ) + "Instead start with a blank model, scan_vocab on the new corpus, intersect_word2vec_format with the old model, then train.") if total_words is None and total_examples is None: raise ValueError( - "You must specify either total_examples or total_words, for proper alpha and progress calculations. " - "The usual value is total_examples=model.corpus_count." - ) + "You must specify either total_examples or total_words, for proper alpha and progress calculations. The usual value is total_examples=model.corpus_count.") if epochs is None: raise ValueError("You must specify an explict epochs count. 
The usual value is epochs=model.iter.") start_alpha = start_alpha or self.alpha @@ -937,7 +900,9 @@ def job_producer(): pushed_words, pushed_examples = 0, 0 next_alpha = start_alpha if next_alpha > self.min_alpha_yet_reached: - logger.warning("Effective 'alpha' higher than previous training cycles") + logger.warning( + "Effective 'alpha' higher than previous training cycles" + ) self.min_alpha_yet_reached = next_alpha job_no = 0 @@ -953,8 +918,7 @@ def job_producer(): # no => submit the existing job logger.debug( "queueing job #%i (%i words, %i sentences) at alpha %.05f", - job_no, batch_size, len(job_batch), next_alpha - ) + job_no, batch_size, len(job_batch), next_alpha) job_no += 1 job_queue.put((job_batch, next_alpha)) @@ -978,15 +942,15 @@ def job_producer(): if job_batch: logger.debug( "queueing job #%i (%i words, %i sentences) at alpha %.05f", - job_no, batch_size, len(job_batch), next_alpha - ) + job_no, batch_size, len(job_batch), next_alpha) job_no += 1 job_queue.put((job_batch, next_alpha)) if job_no == 0 and self.train_count == 0: logger.warning( "train() called with an empty iterator (if not intended, " - "be sure to provide a corpus that offers restartable iteration = an iterable)." + "be sure to provide a corpus that offers restartable " + "iteration = an iterable)." ) # give the workers heads up that they can finish -- no more work! @@ -1031,31 +995,34 @@ def job_producer(): logger.info( "PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", 100.0 * example_count / total_examples, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue) - ) + utils.qsize(job_queue), utils.qsize(progress_queue)) else: # words-based progress % logger.info( "PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", 100.0 * raw_word_count / total_words, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue) - ) + utils.qsize(job_queue), utils.qsize(progress_queue)) next_report = elapsed + report_delay # all done; report the final stats elapsed = default_timer() - start logger.info( "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed - ) + raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed) if job_tally < 10 * self.workers: - logger.warning("under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay") + logger.warning( + "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay" + ) # check that the input corpus hasn't changed during iteration if total_examples and total_examples != example_count: - logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples) + logger.warning( + "supplied example count (%i) did not equal expected count (%i)", example_count, total_examples + ) if total_words and total_words != raw_word_count: - logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words) + logger.warning( + "supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words + ) self.train_count += 1 # number of times train() has been called self.total_train_time += elapsed @@ -1082,25 +1049,21 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor """ if FAST_VERSION < 0: - warnings.warn( - "C extension compilation failed, scoring will be slow. 
" - "Install a C compiler and reinstall gensim for fastness." - ) + warnings.warn("C extension compilation failed, scoring will be slow. " + "Install a C compiler and reinstall gensim for fastness.") logger.info( "scoring sentences with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s and negative=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative - ) + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative) if not self.wv.vocab: raise RuntimeError("you must first build vocabulary before scoring new data") if not self.hs: - raise RuntimeError( - "We have currently only implemented score for the hierarchical softmax scheme, " - "so you need to have run word2vec with hs=1 and negative=0 for this to work." - ) + raise RuntimeError("We have currently only implemented score \ + for the hierarchical softmax scheme, so you need to have \ + run word2vec with hs=1 and negative=0 for this to work.") def worker_loop(): """Compute log probability for each sentence, lifting lists of sentences from the jobs queue.""" @@ -1146,14 +1109,15 @@ def worker_loop(): if (job_no - 1) * chunksize > total_sentences: logger.warning( "terminating after %i sentences (set higher total_sentences if you want more).", - total_sentences - ) + total_sentences) job_no -= 1 raise StopIteration() logger.debug("putting job #%i in the queue", job_no) job_queue.put(items) except StopIteration: - logger.info("reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs + 1) + logger.info( + "reached end of input; waiting to finish %i outstanding jobs", + job_no - done_jobs + 1) for _ in xrange(self.workers): job_queue.put(None) # give the workers heads up that they can finish -- no more work! push_done = True @@ -1166,8 +1130,7 @@ def worker_loop(): if elapsed >= next_report: logger.info( "PROGRESS: at %.2f%% sentences, %.0f sentences/s", - 100.0 * sentence_count, sentence_count / elapsed - ) + 100.0 * sentence_count, sentence_count / elapsed) next_report = elapsed + report_delay # don't flood log, wait report_delay seconds else: # loop ended by job count; really done @@ -1179,8 +1142,7 @@ def worker_loop(): self.clear_sims() logger.info( "scoring %i sentences took %.1fs, %.0f sentences/s", - sentence_count, elapsed, sentence_count / elapsed - ) + sentence_count, elapsed, sentence_count / elapsed) return sentence_scores[:sentence_count] def clear_sims(self): @@ -1207,10 +1169,9 @@ def update_weights(self): # Raise an error if an online update is run before initial training on a corpus if not len(self.wv.syn0): - raise RuntimeError( - "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " - "First build the vocabulary of your model with a corpus before doing an online update." - ) + raise RuntimeError("You cannot do an online vocabulary-update of a model which has no prior vocabulary. " \ + "First build the vocabulary of your model with a corpus " \ + "before doing an online update.") self.wv.syn0 = vstack([self.wv.syn0, newsyn0]) @@ -1259,16 +1220,16 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut training. Use 1.0 to allow further training updates of merged vectors. 
""" overlap_count = 0 - logger.info("loading projection weights from %s", fname) + logger.info("loading projection weights from %s" % (fname)) with utils.smart_open(fname) as fin: header = utils.to_unicode(fin.readline(), encoding=encoding) - vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format + vocab_size, vector_size = map(int, header.split()) # throws for invalid file format if not vector_size == self.vector_size: raise ValueError("incompatible vector size %d in file %s" % (vector_size, fname)) # TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)? if binary: binary_len = dtype(REAL).itemsize * vector_size - for _ in xrange(vocab_size): + for line_no in xrange(vocab_size): # mixed text and binary: read text first, then binary word = [] while True: @@ -1287,15 +1248,15 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut for line_no, line in enumerate(fin): parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") if len(parts) != vector_size + 1: - raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) - word, weights = parts[0], [REAL(x) for x in parts[1:]] + raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no)) + word, weights = parts[0], list(map(REAL, parts[1:])) if word in self.wv.vocab: overlap_count += 1 self.wv.syn0[self.wv.vocab[word].index] = weights self.syn0_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0 stops further changes - logger.info("merged %d vectors into %s matrix from %s", overlap_count, self.wv.syn0.shape, fname) + logger.info("merged %d vectors into %s matrix from %s" % (overlap_count, self.wv.syn0.shape, fname)) - def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None): + def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None): """ Deprecated. Use self.wv.most_similar() instead. Refer to the documentation for `gensim.models.KeyedVectors.most_similar` @@ -1309,7 +1270,7 @@ def wmdistance(self, document1, document2): """ return self.wv.wmdistance(document1, document2) - def most_similar_cosmul(self, positive=None, negative=None, topn=10): + def most_similar_cosmul(self, positive=[], negative=[], topn=10): """ Deprecated. Use self.wv.most_similar_cosmul() instead. Refer to the documentation for `gensim.models.KeyedVectors.most_similar_cosmul` @@ -1368,10 +1329,9 @@ def n_similarity(self, ws1, ws2): def predict_output_word(self, context_words_list, topn=10): """Report the probability distribution of the center word given the context words as input to the trained model.""" if not self.negative: - raise RuntimeError( - "We have currently only implemented predict_output_word for the negative sampling scheme, " - "so you need to have run word2vec with negative > 0 for this to work." 
- ) + raise RuntimeError("We have currently only implemented predict_output_word " + "for the negative sampling scheme, so you need to have " + "run word2vec with negative > 0 for this to work.") if not hasattr(self.wv, 'syn0') or not hasattr(self, 'syn1neg'): raise RuntimeError("Parameters required for predicting the output words not found.") @@ -1390,7 +1350,8 @@ def predict_output_word(self, context_words_list, topn=10): prob_values = exp(dot(l1, self.syn1neg.T)) # propagate hidden -> output and take softmax to get probabilities prob_values /= sum(prob_values) top_indices = matutils.argsort(prob_values, topn=topn, reverse=True) - return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices] # returning the most probable output words with their probabilities + return [(self.wv.index2word[index1], prob_values[index1]) for index1 in + top_indices] # returning the most probable output words with their probabilities def init_sims(self, replace=False): """ @@ -1412,10 +1373,8 @@ def estimate_memory(self, vocab_size=None, report=None): if self.negative: report['syn1neg'] = vocab_size * self.layer1_size * dtype(REAL).itemsize report['total'] = sum(report.values()) - logger.info( - "estimated required memory for %i words and %i dimensions: %i bytes", - vocab_size, self.vector_size, report['total'] - ) + logger.info("estimated required memory for %i words and %i dimensions: %i bytes", + vocab_size, self.vector_size, report['total']) return report @staticmethod @@ -1434,7 +1393,8 @@ def log_evaluate_word_pairs(pearson, spearman, oov, pairs): """ return KeyedVectors.log_evaluate_word_pairs(pearson, spearman, oov, pairs) - def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): + def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, + dummy4unknown=False): """ Deprecated. Use self.wv.evaluate_word_pairs() instead. Refer to the documentation for `gensim.models.KeyedVectors.evaluate_word_pairs` @@ -1442,14 +1402,12 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown) def __str__(self): - return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha) + return "%s(vocab=%s, size=%s, alpha=%s)" % ( + self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha) def _minimize_model(self, save_syn1=False, save_syn1neg=False, save_syn0_lockf=False): warnings.warn( - "This method would be deprecated in the future. " - "Keep just_word_vectors = model.wv to retain just the KeyedVectors instance " - "for read-only querying of word vectors." - ) + "This method would be deprecated in the future. 
Keep just_word_vectors = model.wv to retain just the KeyedVectors instance for read-only querying of word vectors.") if save_syn1 and save_syn1neg and save_syn0_lockf: return if hasattr(self, 'syn1') and not save_syn1: @@ -1492,7 +1450,7 @@ def load(cls, *args, **kwargs): if hasattr(v, 'sample_int'): break # already 0.12.0+ style int probabilities elif hasattr(v, 'sample_probability'): - v.sample_int = int(round(v.sample_probability * 2**32)) + v.sample_int = int(round(v.sample_probability * 2 ** 32)) del v.sample_probability if not hasattr(model, 'syn0_lockf') and hasattr(model, 'syn0'): model.syn0_lockf = ones(len(model.wv.syn0), dtype=REAL) @@ -1516,7 +1474,7 @@ def _load_specials(self, *args, **kwargs): @classmethod def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', - limit=None, datatype=REAL): + limit=None, datatype=REAL): """Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.""" raise DeprecationWarning("Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.") @@ -1623,7 +1581,7 @@ def __iter__(self): line = utils.to_unicode(line).split() i = 0 while i < len(line): - yield line[i: i + self.max_sentence_length] + yield line[i:i + self.max_sentence_length] i += self.max_sentence_length @@ -1641,7 +1599,7 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): Example:: - sentences = PathLineSentences(os.getcwd() + '\\corpus\\') + sentences = LineSentencePath(os.getcwd() + '\\corpus\\') The files in the directory should be either text files, .bz2 files, or .gz files. @@ -1655,19 +1613,19 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): self.input_files = [self.source] # force code compatibility with list of files elif os.path.isdir(self.source): self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path - logging.debug('reading directory %s', self.source) + logging.debug('reading directory ' + self.source) self.input_files = os.listdir(self.source) self.input_files = [self.source + file for file in self.input_files] # make full paths self.input_files.sort() # makes sure it happens in filename order else: # not a file or a directory, then we can't do anything with it raise ValueError('input is neither a file nor a path') - logging.info('files read into PathLineSentences:%s', '\n'.join(self.input_files)) + logging.info('files read into PathLineSentences:' + '\n'.join(self.input_files)) def __iter__(self): - """iterate through the files""" + '''iterate through the files''' for file_name in self.input_files: - logging.info('reading file %s', file_name) + logging.info('reading file ' + file_name) with utils.smart_open(file_name) as fin: for line in itertools.islice(fin, self.limit): line = utils.to_unicode(line).split() @@ -1680,6 +1638,7 @@ def __iter__(self): # Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3 if __name__ == "__main__": import argparse + logging.basicConfig( format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) @@ -1692,7 +1651,7 @@ def __iter__(self): print(globals()['__doc__'] % locals()) sys.exit(1) - from gensim.models.word2vec import Word2Vec # noqa:F811 avoid referencing __main__ in pickle + from gensim.models.word2vec import Word2Vec # avoid referencing __main__ in pickle seterr(all='raise') # don't ignore numpy errors @@ -1701,14 +1660,23 @@ def __iter__(self): 
parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors") parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5) parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100) - parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3) - parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1]) - parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5) + parser.add_argument("-sample", + help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", + type=float, default=1e-3) + parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, + choices=[0, 1]) + parser.add_argument("-negative", + help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", + type=int, default=5) parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12) parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5) - parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5) - parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1]) - parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1]) + parser.add_argument("-min_count", + help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, + default=5) + parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", + type=int, default=1, choices=[0, 1]) + parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, + default=0, choices=[0, 1]) parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model") args = parser.parse_args() @@ -1723,8 +1691,7 @@ def __iter__(self): model = Word2Vec( corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window, sample=args.sample, sg=skipgram, hs=args.hs, - negative=args.negative, cbow_mean=1, iter=args.iter - ) + negative=args.negative, cbow_mean=1, iter=args.iter) if args.output: outfile = args.output @@ -1741,3 +1708,5 @@ def __iter__(self): model.accuracy(args.accuracy) logger.info("finished running %s", program) + + From c4f387eddf6ebe6f20c1b31f2a301115b23b71b7 Mon Sep 17 00:00:00 2001 From: jodevak Date: Mon, 25 Sep 2017 18:11:44 +0300 Subject: [PATCH 02/20] fix build vocab speed issue, function build vocab from previously provided word frequencies table --- gensim/models/word2vec.py | 376 +++++++++++++++++++++----------------- 1 file changed, 212 insertions(+), 164 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index fea1cdc990..2e6eb89cb2 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ 
-114,18 +114,16 @@ except ImportError: from Queue import Queue, Empty -from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL, \ - double, uint32, seterr, array, uint8, vstack, fromstring, sqrt, newaxis, \ - ndarray, empty, sum as np_sum, prod, ones, ascontiguousarray, vstack, logaddexp +from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL,\ + uint32, seterr, array, uint8, vstack, fromstring, sqrt,\ + empty, sum as np_sum, ones, logaddexp from scipy.special import expit from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc -from gensim.corpora.dictionary import Dictionary from six import iteritems, itervalues, string_types from six.moves import xrange from types import GeneratorType -from scipy import stats logger = logging.getLogger(__name__) @@ -138,7 +136,6 @@ FAST_VERSION = -1 MAX_WORDS_IN_BATCH = 10000 - def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): """ Update skip-gram model by training on a sequence of sentences. @@ -153,7 +150,7 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): result = 0 for sentence in sentences: word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] + model.wv.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code @@ -162,13 +159,13 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): # don't train on the `word` itself if pos2 != pos: - train_sg_pair(model, model.wv.index2word[word.index], word2.index, alpha, - compute_loss=compute_loss) + train_sg_pair( + model, model.wv.index2word[word.index], word2.index, alpha, compute_loss=compute_loss + ) result += len(word_vocabs) return result - def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False): """ Update CBOW model by training on a sequence of sentences. @@ -183,7 +180,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss result = 0 for sentence in sentences: word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] + model.wv.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code start = max(0, pos - model.window + reduced_window) @@ -196,7 +193,6 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss result += len(word_vocabs) return result - def score_sentence_sg(model, sentence, work=None): """ Obtain likelihood score for a single sentence in a fitted skip-gram representaion. @@ -226,7 +222,6 @@ def score_sentence_sg(model, sentence, work=None): return log_prob_sentence - def score_sentence_cbow(model, sentence, work=None, neu1=None): """ Obtain likelihood score for a single sentence in a fitted CBOW representaion. 
@@ -259,18 +254,32 @@ def score_sentence_cbow(model, sentence, work=None, neu1=None): def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_hidden=True, - context_vectors=None, context_locks=None, compute_loss=False): + context_vectors=None, context_locks=None, compute_loss=False, is_ft=False): if context_vectors is None: - context_vectors = model.wv.syn0 + if is_ft: + context_vectors_vocab = model.wv.syn0_vocab + context_vectors_ngrams = model.wv.syn0_ngrams + else: + context_vectors = model.wv.syn0 if context_locks is None: - context_locks = model.syn0_lockf + if is_ft: + context_locks_vocab = model.syn0_vocab_lockf + context_locks_ngrams = model.syn0_ngrams_lockf + else: + context_locks = model.syn0_lockf if word not in model.wv.vocab: return predict_word = model.wv.vocab[word] # target word (NN output) - l1 = context_vectors[context_index] # input word (NN input/projection layer) - lock_factor = context_locks[context_index] + if is_ft: + l1_vocab = context_vectors_vocab[context_index[0]] + l1_ngrams = np_sum(context_vectors_ngrams[context_index[1:]], axis=0) + if context_index: + l1 = np_sum([l1_vocab, l1_ngrams], axis=0) / len(context_index) + else: + l1 = context_vectors[context_index] # input word (NN input/projection layer) + lock_factor = context_locks[context_index] neu1e = zeros(l1.shape) @@ -286,7 +295,7 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h # loss component corresponding to hierarchical softmax if compute_loss: - sgn = (-1.0) ** predict_word.code # `ch` function, 0 -> 1, 1 -> -1 + sgn = (-1.0)**predict_word.code # `ch` function, 0 -> 1, 1 -> -1 lprob = -log(expit(-sgn * prod_term)) model.running_training_loss += sum(lprob) @@ -311,12 +320,30 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h model.running_training_loss -= log(expit(prod_term[0])) # for the output word if learn_vectors: - l1 += neu1e * lock_factor # learn input -> hidden (mutates model.wv.syn0[word2.index], if that is l1) + if is_ft: + model.wv.syn0_vocab[context_index[0]] += neu1e * context_locks_vocab[context_index[0]] + for i in context_index[1:]: + model.wv.syn0_ngrams[i] += neu1e * context_locks_ngrams[i] + else: + l1 += neu1e * lock_factor # learn input -> hidden (mutates model.wv.syn0[word2.index], if that is l1) return neu1e -def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, - compute_loss=False): +def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, compute_loss=False, + context_vectors=None, context_locks=None, is_ft=False): + if context_vectors is None: + if is_ft: + context_vectors_vocab = model.wv.syn0_vocab + context_vectors_ngrams = model.wv.syn0_ngrams + else: + context_vectors = model.wv.syn0 + if context_locks is None: + if is_ft: + context_locks_vocab = model.syn0_vocab_lockf + context_locks_ngrams = model.syn0_ngrams_lockf + else: + context_locks = model.syn0_lockf + neu1e = zeros(l1.shape) if model.hs: @@ -330,7 +357,7 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr # loss component corresponding to hierarchical softmax if compute_loss: - sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 model.running_training_loss += sum(-log(expit(-sgn * prod_term))) if model.negative: @@ -355,10 +382,18 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr if learn_vectors: # 
learn input -> hidden, here for all words in the window separately - if not model.cbow_mean and input_word_indices: - neu1e /= len(input_word_indices) - for i in input_word_indices: - model.wv.syn0[i] += neu1e * model.syn0_lockf[i] + if is_ft: + if not model.cbow_mean and input_word_indices: + neu1e /= (len(input_word_indices[0]) + len(input_word_indices[1])) + for i in input_word_indices[0]: + context_vectors_vocab[i] += neu1e * context_locks_vocab[i] + for i in input_word_indices[1]: + context_vectors_ngrams[i] += neu1e * context_locks_ngrams[i] + else: + if not model.cbow_mean and input_word_indices: + neu1e /= len(input_word_indices) + for i in input_word_indices: + context_vectors[i] += neu1e * context_locks[i] return neu1e @@ -366,14 +401,14 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr def score_sg_pair(model, word, word2): l1 = model.wv.syn0[word2.index] l2a = deepcopy(model.syn1[word.point]) # 2d matrix, codelen x layer1_size - sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) return sum(lprob) def score_cbow_pair(model, word, l1): l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size - sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) return sum(lprob) @@ -390,11 +425,10 @@ class Word2Vec(utils.SaveLoad): """ - def __init__( - self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False): + def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, + max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, + sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False): """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. @@ -468,9 +502,9 @@ def __init__( self.load = call_on_class_only if FAST_VERSION == -1: - logger.warning('Slow version of {0} is being used'.format(__name__)) + logger.warning('Slow version of %s is being used', __name__) else: - logger.debug('Fast version of {0} is being used'.format(__name__)) + logger.debug('Fast version of %s is being used', __name__) self.initialize_word_vectors() self.sg = int(sg) @@ -506,18 +540,19 @@ def __init__( if isinstance(sentences, GeneratorType): raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.") self.build_vocab(sentences, trim_rule=trim_rule) - self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, - start_alpha=self.alpha, end_alpha=self.min_alpha) + self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, start_alpha=self.alpha, end_alpha=self.min_alpha) else: if trim_rule is not None: logger.warning( - "The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. ") - logger.warning("Model initialized without sentences. trim_rule provided, if any, will be ignored.") + "The rule, if given, is only used to prune vocabulary during build_vocab() " + "and is not stored as part of the model. 
Model initialized without sentences. " + "trim_rule provided, if any, will be ignored." + ) def initialize_word_vectors(self): self.wv = KeyedVectors() - def make_cum_table(self, power=0.75, domain=2 ** 31 - 1): + def make_cum_table(self, power=0.75, domain=2**31 - 1): """ Create a cumulative-distribution table using stored vocabulary word counts for drawing random words in the negative-sampling training routines. @@ -534,10 +569,10 @@ def make_cum_table(self, power=0.75, domain=2 ** 31 - 1): # compute sum of all power (Z in paper) train_words_pow = 0.0 for word_index in xrange(vocab_size): - train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count ** power + train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count**power cumulative = 0.0 for word_index in xrange(vocab_size): - cumulative += self.wv.vocab[self.wv.index2word[word_index]].count ** power + cumulative += self.wv.vocab[self.wv.index2word[word_index]].count**power self.cum_table[word_index] = round(cumulative / train_words_pow * domain) if len(self.cum_table) > 0: assert self.cum_table[-1] == domain @@ -555,8 +590,9 @@ def create_binary_tree(self): heapq.heapify(heap) for i in xrange(len(self.wv.vocab) - 1): min1, min2 = heapq.heappop(heap), heapq.heappop(heap) - heapq.heappush(heap, - Vocab(count=min1.count + min2.count, index=i + len(self.wv.vocab), left=min1, right=min2)) + heapq.heappush( + heap, Vocab(count=min1.count + min2.count, index=i + len(self.wv.vocab), left=min1, right=min2) + ) # recurse over the tree, assigning a binary code to each vocabulary word if heap: @@ -585,6 +621,7 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_ self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling self.finalize_vocab(update=update) # build tables & arrays + def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): """ @@ -599,8 +636,8 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling self.finalize_vocab(update=update) # build tables & arrays - def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): + def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): """Do an initial scan of all words appearing in sentences.""" logger.info("collecting all words and their counts") sentence_no = -1 @@ -608,33 +645,37 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): min_reduce = 1 vocab = defaultdict(int) checked_string_types = 0 - for sentence_no, sentence in enumerate(sentences): if not checked_string_types: if isinstance(sentence, string_types): logger.warning( - "Each 'sentences' item should be a list of words (usually unicode strings)." - "First item here is instead plain %s.", type(sentence) + "Each 'sentences' item should be a list of words (usually unicode strings). 
" + "First item here is instead plain %s.", + type(sentence) ) checked_string_types += 1 if sentence_no % progress_per == 0: - logger.info("PROGRESS: at sentence #%i, processed %i words, keeping %i word types", - sentence_no, total_words, len(vocab)) - + logger.info( + "PROGRESS: at sentence #%i, processed %i words, keeping %i word types", + sentence_no, sum(itervalues(vocab)) + total_words, len(vocab) + ) for word in sentence: vocab[word] += 1 - total_words += 1 if self.max_vocab_size and len(vocab) > self.max_vocab_size: - utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) + total_words += utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) min_reduce += 1 - logger.info("collected %i word types from a corpus of %i raw words and %i sentences", len(vocab), total_words, sentence_no + 1) + total_words += sum(itervalues(vocab)) + logger.info( + "collected %i word types from a corpus of %i raw words and %i sentences", + len(vocab), total_words, sentence_no + 1 + ) self.corpus_count = sentence_no + 1 self.raw_vocab = vocab - def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab=False, trim_rule=None, - update=False): + def scale_vocab(self, min_count=None, sample=None, dry_run=False, + keep_raw_vocab=False, trim_rule=None, update=False): """ Apply vocabulary settings for `min_count` (discarding less-frequent words) and `sample` (controlling the downsampling of more-frequent words). @@ -675,12 +716,16 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab drop_total += v original_unique_total = len(retain_words) + drop_unique retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) - logger.info("min_count=%d retains %i unique words (%i%% of original %i, drops %i)", - min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique) + logger.info( + "min_count=%d retains %i unique words (%i%% of original %i, drops %i)", + min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique + ) original_total = retain_total + drop_total retain_pct = retain_total * 100 / max(original_total, 1) - logger.info("min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", - min_count, retain_total, retain_pct, original_total, drop_total) + logger.info( + "min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", + min_count, retain_total, retain_pct, original_total, drop_total + ) else: logger.info("Updating model with new vocabulary") new_total = pre_exist_total = 0 @@ -704,10 +749,12 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1) new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1) - logger.info("""New added %i unique words (%i%% of original %i) - and increased the count of %i pre-existing words (%i%% of original %i)""", - len(new_words), new_unique_pct, original_unique_total, - len(pre_exist_words), pre_exist_unique_pct, original_unique_total) + logger.info( + "New added %i unique words (%i%% of original %i) " + "and increased the count of %i pre-existing words (%i%% of original %i)", + len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words), + pre_exist_unique_pct, original_unique_total + ) retain_words = new_words + pre_exist_words retain_total = new_total + pre_exist_total @@ -733,22 +780,23 @@ def scale_vocab(self, 
min_count=None, sample=None, dry_run=False, keep_raw_vocab word_probability = 1.0 downsample_total += v if not dry_run: - self.wv.vocab[w].sample_int = int(round(word_probability * 2 ** 32)) + self.wv.vocab[w].sample_int = int(round(word_probability * 2**32)) if not dry_run and not keep_raw_vocab: logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab)) self.raw_vocab = defaultdict(int) logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique) - logger.info("downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", - downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total) - - # return from each step: words-affected, resulting-corpus-size - report_values = {'drop_unique': drop_unique, 'retain_total': retain_total, - 'downsample_unique': downsample_unique, 'downsample_total': int(downsample_total)} + logger.info( + "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", + downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total + ) - # print extra memory estimates - report_values['memory'] = self.estimate_memory(vocab_size=len(retain_words)) + # return from each step: words-affected, resulting-corpus-size, extra memory estimates + report_values = { + 'drop_unique': drop_unique, 'retain_total': retain_total, 'downsample_unique': downsample_unique, + 'downsample_total': int(downsample_total), 'memory': self.estimate_memory(vocab_size=len(retain_words)) + } return report_values @@ -814,8 +862,7 @@ def _raw_word_count(self, job): return sum(len(sentence) for sentence in job) def train(self, sentences, total_examples=None, total_words=None, - epochs=None, start_alpha=None, end_alpha=None, - word_count=0, + epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, compute_loss=None): """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). @@ -831,11 +878,13 @@ def train(self, sentences, total_examples=None, total_words=None, explicit `epochs` argument MUST be provided. In the common and recommended case, where `train()` is only called once, the model's cached `iter` value should be supplied as `epochs` value. """ - if (self.model_trimmed_post_training): + if self.model_trimmed_post_training: raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method") if FAST_VERSION < 0: - warnings.warn("C extension not loaded for Word2Vec, training will be slow. " - "Install a C compiler and reinstall gensim for fast training.") + warnings.warn( + "C extension not loaded for Word2Vec, training will be slow. " + "Install a C compiler and reinstall gensim for fast training." 
+ ) self.neg_labels = [] if self.negative > 0: # precompute negative labels optimization for pure-python training @@ -849,8 +898,8 @@ def train(self, sentences, total_examples=None, total_words=None, logger.info( "training model with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s negative=%s window=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, - self.hs, self.sample, self.negative, self.window) + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative, self.window + ) if not self.wv.vocab: raise RuntimeError("you must first build vocabulary before training the model") @@ -861,11 +910,15 @@ def train(self, sentences, total_examples=None, total_words=None, raise ValueError( "The number of sentences in the training corpus is missing. Did you load the model via KeyedVectors.load_word2vec_format?" "Models loaded via load_word2vec_format don't support further training. " - "Instead start with a blank model, scan_vocab on the new corpus, intersect_word2vec_format with the old model, then train.") + "Instead start with a blank model, scan_vocab on the new corpus, " + "intersect_word2vec_format with the old model, then train." + ) if total_words is None and total_examples is None: raise ValueError( - "You must specify either total_examples or total_words, for proper alpha and progress calculations. The usual value is total_examples=model.corpus_count.") + "You must specify either total_examples or total_words, for proper alpha and progress calculations. " + "The usual value is total_examples=model.corpus_count." + ) if epochs is None: raise ValueError("You must specify an explict epochs count. The usual value is epochs=model.iter.") start_alpha = start_alpha or self.alpha @@ -900,9 +953,7 @@ def job_producer(): pushed_words, pushed_examples = 0, 0 next_alpha = start_alpha if next_alpha > self.min_alpha_yet_reached: - logger.warning( - "Effective 'alpha' higher than previous training cycles" - ) + logger.warning("Effective 'alpha' higher than previous training cycles") self.min_alpha_yet_reached = next_alpha job_no = 0 @@ -918,7 +969,8 @@ def job_producer(): # no => submit the existing job logger.debug( "queueing job #%i (%i words, %i sentences) at alpha %.05f", - job_no, batch_size, len(job_batch), next_alpha) + job_no, batch_size, len(job_batch), next_alpha + ) job_no += 1 job_queue.put((job_batch, next_alpha)) @@ -942,15 +994,15 @@ def job_producer(): if job_batch: logger.debug( "queueing job #%i (%i words, %i sentences) at alpha %.05f", - job_no, batch_size, len(job_batch), next_alpha) + job_no, batch_size, len(job_batch), next_alpha + ) job_no += 1 job_queue.put((job_batch, next_alpha)) if job_no == 0 and self.train_count == 0: logger.warning( "train() called with an empty iterator (if not intended, " - "be sure to provide a corpus that offers restartable " - "iteration = an iterable)." + "be sure to provide a corpus that offers restartable iteration = an iterable)." ) # give the workers heads up that they can finish -- no more work! 
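The calling pattern these checks enforce, as a short usage sketch (the corpus file name is hypothetical):

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence('corpus.txt')  # hypothetical file: one whitespace-delimited sentence per line
model = Word2Vec(size=100, window=5, min_count=5, workers=4)  # no sentences passed, so nothing trained yet
model.build_vocab(sentences)  # initial vocabulary survey
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)

Passing total_examples=model.corpus_count and epochs=model.iter is the common single-call case the docstring above recommends.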
@@ -995,34 +1047,31 @@ def job_producer(): logger.info( "PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", 100.0 * example_count / total_examples, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue)) + utils.qsize(job_queue), utils.qsize(progress_queue) + ) else: # words-based progress % logger.info( "PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", 100.0 * raw_word_count / total_words, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue)) + utils.qsize(job_queue), utils.qsize(progress_queue) + ) next_report = elapsed + report_delay # all done; report the final stats elapsed = default_timer() - start logger.info( "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed) + raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed + ) if job_tally < 10 * self.workers: - logger.warning( - "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay" - ) + logger.warning("under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay") # check that the input corpus hasn't changed during iteration if total_examples and total_examples != example_count: - logger.warning( - "supplied example count (%i) did not equal expected count (%i)", example_count, total_examples - ) + logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples) if total_words and total_words != raw_word_count: - logger.warning( - "supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words - ) + logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words) self.train_count += 1 # number of times train() has been called self.total_train_time += elapsed @@ -1049,21 +1098,25 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor """ if FAST_VERSION < 0: - warnings.warn("C extension compilation failed, scoring will be slow. " - "Install a C compiler and reinstall gensim for fastness.") + warnings.warn( + "C extension compilation failed, scoring will be slow. " + "Install a C compiler and reinstall gensim for fastness." + ) logger.info( "scoring sentences with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s and negative=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative) + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative + ) if not self.wv.vocab: raise RuntimeError("you must first build vocabulary before scoring new data") if not self.hs: - raise RuntimeError("We have currently only implemented score \ - for the hierarchical softmax scheme, so you need to have \ - run word2vec with hs=1 and negative=0 for this to work.") + raise RuntimeError( + "We have currently only implemented score for the hierarchical softmax scheme, " + "so you need to have run word2vec with hs=1 and negative=0 for this to work." 
+ ) def worker_loop(): """Compute log probability for each sentence, lifting lists of sentences from the jobs queue.""" @@ -1109,15 +1162,14 @@ def worker_loop(): if (job_no - 1) * chunksize > total_sentences: logger.warning( "terminating after %i sentences (set higher total_sentences if you want more).", - total_sentences) + total_sentences + ) job_no -= 1 raise StopIteration() logger.debug("putting job #%i in the queue", job_no) job_queue.put(items) except StopIteration: - logger.info( - "reached end of input; waiting to finish %i outstanding jobs", - job_no - done_jobs + 1) + logger.info("reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs + 1) for _ in xrange(self.workers): job_queue.put(None) # give the workers heads up that they can finish -- no more work! push_done = True @@ -1130,7 +1182,8 @@ def worker_loop(): if elapsed >= next_report: logger.info( "PROGRESS: at %.2f%% sentences, %.0f sentences/s", - 100.0 * sentence_count, sentence_count / elapsed) + 100.0 * sentence_count, sentence_count / elapsed + ) next_report = elapsed + report_delay # don't flood log, wait report_delay seconds else: # loop ended by job count; really done @@ -1142,7 +1195,8 @@ def worker_loop(): self.clear_sims() logger.info( "scoring %i sentences took %.1fs, %.0f sentences/s", - sentence_count, elapsed, sentence_count / elapsed) + sentence_count, elapsed, sentence_count / elapsed + ) return sentence_scores[:sentence_count] def clear_sims(self): @@ -1169,9 +1223,10 @@ def update_weights(self): # Raise an error if an online update is run before initial training on a corpus if not len(self.wv.syn0): - raise RuntimeError("You cannot do an online vocabulary-update of a model which has no prior vocabulary. " \ - "First build the vocabulary of your model with a corpus " \ - "before doing an online update.") + raise RuntimeError( + "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " + "First build the vocabulary of your model with a corpus before doing an online update." + ) self.wv.syn0 = vstack([self.wv.syn0, newsyn0]) @@ -1220,16 +1275,16 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut training. Use 1.0 to allow further training updates of merged vectors. """ overlap_count = 0 - logger.info("loading projection weights from %s" % (fname)) + logger.info("loading projection weights from %s", fname) with utils.smart_open(fname) as fin: header = utils.to_unicode(fin.readline(), encoding=encoding) - vocab_size, vector_size = map(int, header.split()) # throws for invalid file format + vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format if not vector_size == self.vector_size: raise ValueError("incompatible vector size %d in file %s" % (vector_size, fname)) # TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)? 
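# For reference, a minimal standalone sketch of the plain-text flavour of this
# vectors format (the data below is made up): a "vocab_size vector_size" header
# line, then one "word value value ..." line per word.
import io
import numpy as np

fake_file = io.StringIO(u"2 3\nking 0.10 0.20 0.30\nqueen 0.40 0.50 0.60\n")
vocab_size, vector_size = (int(x) for x in fake_file.readline().split())  # header
vectors = {}
for line_no, line in enumerate(fake_file):
    parts = line.rstrip().split(" ")
    if len(parts) != vector_size + 1:
        raise ValueError("invalid vector on line %s" % line_no)
    vectors[parts[0]] = np.array(parts[1:], dtype=np.float32)
print(vocab_size, vectors["queen"])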
if binary: binary_len = dtype(REAL).itemsize * vector_size - for line_no in xrange(vocab_size): + for _ in xrange(vocab_size): # mixed text and binary: read text first, then binary word = [] while True: @@ -1248,15 +1303,15 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut for line_no, line in enumerate(fin): parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") if len(parts) != vector_size + 1: - raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no)) - word, weights = parts[0], list(map(REAL, parts[1:])) + raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) + word, weights = parts[0], [REAL(x) for x in parts[1:]] if word in self.wv.vocab: overlap_count += 1 self.wv.syn0[self.wv.vocab[word].index] = weights self.syn0_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0 stops further changes - logger.info("merged %d vectors into %s matrix from %s" % (overlap_count, self.wv.syn0.shape, fname)) + logger.info("merged %d vectors into %s matrix from %s", overlap_count, self.wv.syn0.shape, fname) - def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None): + def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None): """ Deprecated. Use self.wv.most_similar() instead. Refer to the documentation for `gensim.models.KeyedVectors.most_similar` @@ -1270,7 +1325,7 @@ def wmdistance(self, document1, document2): """ return self.wv.wmdistance(document1, document2) - def most_similar_cosmul(self, positive=[], negative=[], topn=10): + def most_similar_cosmul(self, positive=None, negative=None, topn=10): """ Deprecated. Use self.wv.most_similar_cosmul() instead. Refer to the documentation for `gensim.models.KeyedVectors.most_similar_cosmul` @@ -1329,9 +1384,10 @@ def n_similarity(self, ws1, ws2): def predict_output_word(self, context_words_list, topn=10): """Report the probability distribution of the center word given the context words as input to the trained model.""" if not self.negative: - raise RuntimeError("We have currently only implemented predict_output_word " - "for the negative sampling scheme, so you need to have " - "run word2vec with negative > 0 for this to work.") + raise RuntimeError( + "We have currently only implemented predict_output_word for the negative sampling scheme, " + "so you need to have run word2vec with negative > 0 for this to work." 
+ ) if not hasattr(self.wv, 'syn0') or not hasattr(self, 'syn1neg'): raise RuntimeError("Parameters required for predicting the output words not found.") @@ -1350,8 +1406,7 @@ def predict_output_word(self, context_words_list, topn=10): prob_values = exp(dot(l1, self.syn1neg.T)) # propagate hidden -> output and take softmax to get probabilities prob_values /= sum(prob_values) top_indices = matutils.argsort(prob_values, topn=topn, reverse=True) - return [(self.wv.index2word[index1], prob_values[index1]) for index1 in - top_indices] # returning the most probable output words with their probabilities + return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices] # returning the most probable output words with their probabilities def init_sims(self, replace=False): """ @@ -1373,8 +1428,10 @@ def estimate_memory(self, vocab_size=None, report=None): if self.negative: report['syn1neg'] = vocab_size * self.layer1_size * dtype(REAL).itemsize report['total'] = sum(report.values()) - logger.info("estimated required memory for %i words and %i dimensions: %i bytes", - vocab_size, self.vector_size, report['total']) + logger.info( + "estimated required memory for %i words and %i dimensions: %i bytes", + vocab_size, self.vector_size, report['total'] + ) return report @staticmethod @@ -1393,8 +1450,7 @@ def log_evaluate_word_pairs(pearson, spearman, oov, pairs): """ return KeyedVectors.log_evaluate_word_pairs(pearson, spearman, oov, pairs) - def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, - dummy4unknown=False): + def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): """ Deprecated. Use self.wv.evaluate_word_pairs() instead. Refer to the documentation for `gensim.models.KeyedVectors.evaluate_word_pairs` @@ -1402,12 +1458,14 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown) def __str__(self): - return "%s(vocab=%s, size=%s, alpha=%s)" % ( - self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha) + return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha) def _minimize_model(self, save_syn1=False, save_syn1neg=False, save_syn0_lockf=False): warnings.warn( - "This method would be deprecated in the future. Keep just_word_vectors = model.wv to retain just the KeyedVectors instance for read-only querying of word vectors.") + "This method would be deprecated in the future. " + "Keep just_word_vectors = model.wv to retain just the KeyedVectors instance " + "for read-only querying of word vectors." 
+ ) if save_syn1 and save_syn1neg and save_syn0_lockf: return if hasattr(self, 'syn1') and not save_syn1: @@ -1450,7 +1508,7 @@ def load(cls, *args, **kwargs): if hasattr(v, 'sample_int'): break # already 0.12.0+ style int probabilities elif hasattr(v, 'sample_probability'): - v.sample_int = int(round(v.sample_probability * 2 ** 32)) + v.sample_int = int(round(v.sample_probability * 2**32)) del v.sample_probability if not hasattr(model, 'syn0_lockf') and hasattr(model, 'syn0'): model.syn0_lockf = ones(len(model.wv.syn0), dtype=REAL) @@ -1474,7 +1532,7 @@ def _load_specials(self, *args, **kwargs): @classmethod def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', - limit=None, datatype=REAL): + limit=None, datatype=REAL): """Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.""" raise DeprecationWarning("Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.") @@ -1581,7 +1639,7 @@ def __iter__(self): line = utils.to_unicode(line).split() i = 0 while i < len(line): - yield line[i:i + self.max_sentence_length] + yield line[i: i + self.max_sentence_length] i += self.max_sentence_length @@ -1599,7 +1657,7 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): Example:: - sentences = LineSentencePath(os.getcwd() + '\\corpus\\') + sentences = PathLineSentences(os.getcwd() + '\\corpus\\') The files in the directory should be either text files, .bz2 files, or .gz files. @@ -1613,19 +1671,19 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): self.input_files = [self.source] # force code compatibility with list of files elif os.path.isdir(self.source): self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path - logging.debug('reading directory ' + self.source) + logging.debug('reading directory %s', self.source) self.input_files = os.listdir(self.source) self.input_files = [self.source + file for file in self.input_files] # make full paths self.input_files.sort() # makes sure it happens in filename order else: # not a file or a directory, then we can't do anything with it raise ValueError('input is neither a file nor a path') - logging.info('files read into PathLineSentences:' + '\n'.join(self.input_files)) + logging.info('files read into PathLineSentences:%s', '\n'.join(self.input_files)) def __iter__(self): - '''iterate through the files''' + """iterate through the files""" for file_name in self.input_files: - logging.info('reading file ' + file_name) + logging.info('reading file %s', file_name) with utils.smart_open(file_name) as fin: for line in itertools.islice(fin, self.limit): line = utils.to_unicode(line).split() @@ -1638,7 +1696,6 @@ def __iter__(self): # Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3 if __name__ == "__main__": import argparse - logging.basicConfig( format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) @@ -1651,7 +1708,7 @@ def __iter__(self): print(globals()['__doc__'] % locals()) sys.exit(1) - from gensim.models.word2vec import Word2Vec # avoid referencing __main__ in pickle + from gensim.models.word2vec import Word2Vec # noqa:F811 avoid referencing __main__ in pickle seterr(all='raise') # don't ignore numpy errors @@ -1660,23 +1717,14 @@ def __iter__(self): parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors") parser.add_argument("-window", 
help="Set max skip length WINDOW between words; default is 5", type=int, default=5) parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100) - parser.add_argument("-sample", - help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", - type=float, default=1e-3) - parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, - choices=[0, 1]) - parser.add_argument("-negative", - help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", - type=int, default=5) + parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3) + parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1]) + parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5) parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12) parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5) - parser.add_argument("-min_count", - help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, - default=5) - parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", - type=int, default=1, choices=[0, 1]) - parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, - default=0, choices=[0, 1]) + parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5) + parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1]) + parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1]) parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model") args = parser.parse_args() @@ -1691,7 +1739,8 @@ def __iter__(self): model = Word2Vec( corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window, sample=args.sample, sg=skipgram, hs=args.hs, - negative=args.negative, cbow_mean=1, iter=args.iter) + negative=args.negative, cbow_mean=1, iter=args.iter + ) if args.output: outfile = args.output @@ -1709,4 +1758,3 @@ def __iter__(self): logger.info("finished running %s", program) - From 8abd58b58cc35521cb930ffee78c723b481877ab Mon Sep 17 00:00:00 2001 From: jodevak Date: Mon, 25 Sep 2017 18:17:38 +0300 Subject: [PATCH 03/20] fix build vocab speed issue, function build vocab from previously provided word frequencies table --- gensim/models/word2vec.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 2e6eb89cb2..aada500661 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -612,7 +612,6 @@ def create_binary_tree(self): logger.info("built huffman tree with maximum node depth %i", max_depth) def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, 
progress_per=10000, update=False): - """ Build vocabulary from a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of unicode strings. @@ -623,7 +622,6 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): - """ Build vocabulary from a dictionary of word frequencies """ @@ -1758,3 +1756,4 @@ def __iter__(self): logger.info("finished running %s", program) + From 8ec04332d3f8314635d51433a3c78e55de0cb695 Mon Sep 17 00:00:00 2001 From: jodevak Date: Mon, 25 Sep 2017 18:27:04 +0300 Subject: [PATCH 04/20] fix build vocab speed issue, function build vocab from previously provided word frequencies table --- gensim/models/word2vec.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index aada500661..20f1fc560c 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -643,32 +643,28 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): min_reduce = 1 vocab = defaultdict(int) checked_string_types = 0 + for sentence_no, sentence in enumerate(sentences): if not checked_string_types: if isinstance(sentence, string_types): logger.warning( - "Each 'sentences' item should be a list of words (usually unicode strings). " - "First item here is instead plain %s.", - type(sentence) + "Each 'sentences' item should be a list of words (usually unicode strings)." + "First item here is instead plain %s.", type(sentence) ) checked_string_types += 1 if sentence_no % progress_per == 0: - logger.info( - "PROGRESS: at sentence #%i, processed %i words, keeping %i word types", - sentence_no, sum(itervalues(vocab)) + total_words, len(vocab) - ) + logger.info("PROGRESS: at sentence #%i, processed %i words, keeping %i word types", + sentence_no, total_words, len(vocab)) + for word in sentence: vocab[word] += 1 + total_words += 1 if self.max_vocab_size and len(vocab) > self.max_vocab_size: - total_words += utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) + utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) min_reduce += 1 - total_words += sum(itervalues(vocab)) - logger.info( - "collected %i word types from a corpus of %i raw words and %i sentences", - len(vocab), total_words, sentence_no + 1 - ) + logger.info("collected %i word types from a corpus of %i raw words and %i sentences", len(vocab), total_words, sentence_no + 1) self.corpus_count = sentence_no + 1 self.raw_vocab = vocab @@ -1755,5 +1751,3 @@ def __iter__(self): model.accuracy(args.accuracy) logger.info("finished running %s", program) - - From b9f3a5f81d8dd4c6e953e0dd353bc89b0ea61b2f Mon Sep 17 00:00:00 2001 From: jodevak Date: Mon, 16 Oct 2017 21:26:16 +0300 Subject: [PATCH 05/20] Removing the extra blank lines, documentation in numpy-style to build_vocab_from_freq, and hanging indents in build_vocab --- gensim/models/word2vec.py | 132 ++++++++++++++++++++++++++------------ 1 file changed, 92 insertions(+), 40 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 20f1fc560c..d4d92aed56 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -114,8 +114,8 @@ except ImportError: from Queue import Queue, Empty -from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL,\ - uint32, seterr, array, uint8, vstack, fromstring, sqrt,\ +from numpy import exp, log, dot, zeros, 
outer, random, dtype, float32 as REAL, \ + uint32, seterr, array, uint8, vstack, fromstring, sqrt, \ empty, sum as np_sum, ones, logaddexp from scipy.special import expit @@ -136,6 +136,7 @@ FAST_VERSION = -1 MAX_WORDS_IN_BATCH = 10000 + def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): """ Update skip-gram model by training on a sequence of sentences. @@ -150,7 +151,7 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): result = 0 for sentence in sentences: word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2**32] + model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code @@ -166,6 +167,7 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): result += len(word_vocabs) return result + def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False): """ Update CBOW model by training on a sequence of sentences. @@ -180,7 +182,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss result = 0 for sentence in sentences: word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2**32] + model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code start = max(0, pos - model.window + reduced_window) @@ -193,6 +195,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss result += len(word_vocabs) return result + def score_sentence_sg(model, sentence, work=None): """ Obtain likelihood score for a single sentence in a fitted skip-gram representaion. @@ -222,6 +225,7 @@ def score_sentence_sg(model, sentence, work=None): return log_prob_sentence + def score_sentence_cbow(model, sentence, work=None, neu1=None): """ Obtain likelihood score for a single sentence in a fitted CBOW representaion. 
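A small standalone sketch of the dynamic-window idea used in these batching hunks (sentence and numbers are made up): each centre position shrinks the effective window on both sides by a random amount `b`.

import random

sentence = ["the", "quick", "brown", "fox", "jumps", "over"]
window = 3
pos = 2                                         # centre word "brown"
reduced_window = random.randint(0, window - 1)  # `b` in the original word2vec code
start = max(0, pos - window + reduced_window)   # left edge of the shrunken window
stop = pos + window + 1 - reduced_window        # right edge (exclusive)
context = [w for i, w in enumerate(sentence[start:stop], start) if i != pos]
print(reduced_window, context)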
@@ -295,7 +299,7 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h # loss component corresponding to hierarchical softmax if compute_loss: - sgn = (-1.0)**predict_word.code # `ch` function, 0 -> 1, 1 -> -1 + sgn = (-1.0) ** predict_word.code # `ch` function, 0 -> 1, 1 -> -1 lprob = -log(expit(-sgn * prod_term)) model.running_training_loss += sum(lprob) @@ -329,7 +333,8 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h return neu1e -def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, compute_loss=False, +def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, + compute_loss=False, context_vectors=None, context_locks=None, is_ft=False): if context_vectors is None: if is_ft: @@ -357,7 +362,7 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr # loss component corresponding to hierarchical softmax if compute_loss: - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 model.running_training_loss += sum(-log(expit(-sgn * prod_term))) if model.negative: @@ -401,14 +406,14 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr def score_sg_pair(model, word, word2): l1 = model.wv.syn0[word2.index] l2a = deepcopy(model.syn1[word.point]) # 2d matrix, codelen x layer1_size - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) return sum(lprob) def score_cbow_pair(model, word, l1): l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) return sum(lprob) @@ -540,7 +545,8 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, if isinstance(sentences, GeneratorType): raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.") self.build_vocab(sentences, trim_rule=trim_rule) - self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, start_alpha=self.alpha, end_alpha=self.min_alpha) + self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, start_alpha=self.alpha, + end_alpha=self.min_alpha) else: if trim_rule is not None: logger.warning( @@ -552,7 +558,7 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, def initialize_word_vectors(self): self.wv = KeyedVectors() - def make_cum_table(self, power=0.75, domain=2**31 - 1): + def make_cum_table(self, power=0.75, domain=2 ** 31 - 1): """ Create a cumulative-distribution table using stored vocabulary word counts for drawing random words in the negative-sampling training routines. 
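A standalone numpy illustration of the `ch` sign trick these hunks rely on (numbers are made up): a Huffman code bit of 0 maps to sgn = +1 and a bit of 1 to sgn = -1, and the scoring expression -logaddexp(0, -sgn * x) is exactly log(sigmoid(sgn * x)).

import numpy as np
from scipy.special import expit  # logistic sigmoid

code = np.array([0, 1, 1], dtype=np.uint8)    # hypothetical Huffman code of one word
prod_term = np.array([1.2, -0.3, 0.8])        # dot products along that word's tree path
sgn = (-1.0) ** code                          # `ch` function: 0 -> +1, 1 -> -1
lprob = -np.logaddexp(0, -sgn * prod_term)    # elementwise log(sigmoid(sgn * prod_term))
assert np.allclose(lprob, np.log(expit(sgn * prod_term)))
print(lprob.sum())                            # the per-word score sums these path terms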
@@ -569,10 +575,10 @@ def make_cum_table(self, power=0.75, domain=2**31 - 1): # compute sum of all power (Z in paper) train_words_pow = 0.0 for word_index in xrange(vocab_size): - train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count**power + train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count ** power cumulative = 0.0 for word_index in xrange(vocab_size): - cumulative += self.wv.vocab[self.wv.index2word[word_index]].count**power + cumulative += self.wv.vocab[self.wv.index2word[word_index]].count ** power self.cum_table[word_index] = round(cumulative / train_words_pow * domain) if len(self.cum_table) > 0: assert self.cum_table[-1] == domain @@ -617,13 +623,38 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_ Each sentence must be a list of unicode strings. """ self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey - self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling + self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, + update=update) # trim by min_count & precalculate downsampling self.finalize_vocab(update=update) # build tables & arrays - def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): """ - Build vocabulary from a dictionary of word frequencies + Build vocabulary from a dictionary of word frequencies. + Build model vocabulary from a passed dictionary that contains (word,word count). + Words must be of type unicode strings. + + Parameters + ---------- + `word_freq` : dict + Word,Word_Count dictionary. + `keep_raw_vocab` : bool + If not true, delete the raw vocabulary after the scaling is done and free up RAM. + `corpus_count`: int + Even if no corpus is provided, this argument can set corpus_count explicitly. + `trim_rule` = vocabulary trimming rule, specifies whether certain words should remain + in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). + Can be None (min_count will be used), or a callable that accepts parameters (word, count, min_count) and + returns either `utils.RULE_DISCARD`, `utils.RULE_KEEP` or `utils.RULE_DEFAULT`. + `update`: bool + If true, the new provided words in `word_freq` dict will be added to model's vocab. 
+ + Returns + -------- + None + + Examples + -------- + >>> build_vocab_from_freq({"Word1":15,"Word2":20}, update=True) """ logger.info("Processing provided word frequencies") vocab = defaultdict(int, word_freq) @@ -631,10 +662,10 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No self.corpus_count = corpus_count if corpus_count else 0 self.raw_vocab = vocab - self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling + self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, + update=update) # trim by min_count & precalculate downsampling self.finalize_vocab(update=update) # build tables & arrays - def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): """Do an initial scan of all words appearing in sentences.""" logger.info("collecting all words and their counts") @@ -647,13 +678,16 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): for sentence_no, sentence in enumerate(sentences): if not checked_string_types: if isinstance(sentence, string_types): - logger.warning( - "Each 'sentences' item should be a list of words (usually unicode strings)." - "First item here is instead plain %s.", type(sentence) - ) + logger.warning("Each 'sentences' " + "item should be a list of words " + "(usually unicode strings)." + "First item here is instead plain %s.", type(sentence) + ) checked_string_types += 1 if sentence_no % progress_per == 0: - logger.info("PROGRESS: at sentence #%i, processed %i words, keeping %i word types", + logger.info("PROGRESS: at sentence #%i," + " processed %i words, " + "keeping %i word types", sentence_no, total_words, len(vocab)) for word in sentence: @@ -664,7 +698,8 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) min_reduce += 1 - logger.info("collected %i word types from a corpus of %i raw words and %i sentences", len(vocab), total_words, sentence_no + 1) + logger.info("collected %i word types from a corpus of %i raw words and %i sentences", len(vocab), total_words, + sentence_no + 1) self.corpus_count = sentence_no + 1 self.raw_vocab = vocab @@ -774,7 +809,7 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, word_probability = 1.0 downsample_total += v if not dry_run: - self.wv.vocab[w].sample_int = int(round(word_probability * 2**32)) + self.wv.vocab[w].sample_int = int(round(word_probability * 2 ** 32)) if not dry_run and not keep_raw_vocab: logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab)) @@ -892,7 +927,8 @@ def train(self, sentences, total_examples=None, total_words=None, logger.info( "training model with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s negative=%s window=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative, self.window + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative, + self.window ) if not self.wv.vocab: @@ -1059,13 +1095,16 @@ def job_producer(): raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed ) if job_tally < 10 * self.workers: - logger.warning("under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay") + logger.warning( + "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay") # check that the input corpus hasn't changed during iteration if 
total_examples and total_examples != example_count: - logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples) + logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count, + total_examples) if total_words and total_words != raw_word_count: - logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words) + logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, + total_words) self.train_count += 1 # number of times train() has been called self.total_train_time += elapsed @@ -1400,7 +1439,8 @@ def predict_output_word(self, context_words_list, topn=10): prob_values = exp(dot(l1, self.syn1neg.T)) # propagate hidden -> output and take softmax to get probabilities prob_values /= sum(prob_values) top_indices = matutils.argsort(prob_values, topn=topn, reverse=True) - return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices] # returning the most probable output words with their probabilities + return [(self.wv.index2word[index1], prob_values[index1]) for index1 in + top_indices] # returning the most probable output words with their probabilities def init_sims(self, replace=False): """ @@ -1444,7 +1484,8 @@ def log_evaluate_word_pairs(pearson, spearman, oov, pairs): """ return KeyedVectors.log_evaluate_word_pairs(pearson, spearman, oov, pairs) - def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): + def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, + dummy4unknown=False): """ Deprecated. Use self.wv.evaluate_word_pairs() instead. Refer to the documentation for `gensim.models.KeyedVectors.evaluate_word_pairs` @@ -1452,7 +1493,8 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown) def __str__(self): - return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha) + return "%s(vocab=%s, size=%s, alpha=%s)" % ( + self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha) def _minimize_model(self, save_syn1=False, save_syn1neg=False, save_syn0_lockf=False): warnings.warn( @@ -1502,7 +1544,7 @@ def load(cls, *args, **kwargs): if hasattr(v, 'sample_int'): break # already 0.12.0+ style int probabilities elif hasattr(v, 'sample_probability'): - v.sample_int = int(round(v.sample_probability * 2**32)) + v.sample_int = int(round(v.sample_probability * 2 ** 32)) del v.sample_probability if not hasattr(model, 'syn0_lockf') and hasattr(model, 'syn0'): model.syn0_lockf = ones(len(model.wv.syn0), dtype=REAL) @@ -1526,7 +1568,7 @@ def _load_specials(self, *args, **kwargs): @classmethod def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', - limit=None, datatype=REAL): + limit=None, datatype=REAL): """Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.""" raise DeprecationWarning("Deprecated. 
Use gensim.models.KeyedVectors.load_word2vec_format instead.") @@ -1690,6 +1732,7 @@ def __iter__(self): # Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3 if __name__ == "__main__": import argparse + logging.basicConfig( format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) @@ -1711,14 +1754,23 @@ def __iter__(self): parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors") parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5) parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100) - parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3) - parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1]) - parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5) + parser.add_argument("-sample", + help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", + type=float, default=1e-3) + parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, + choices=[0, 1]) + parser.add_argument("-negative", + help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", + type=int, default=5) parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12) parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5) - parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5) - parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1]) - parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1]) + parser.add_argument("-min_count", + help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, + default=5) + parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", + type=int, default=1, choices=[0, 1]) + parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, + default=0, choices=[0, 1]) parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model") args = parser.parse_args() From 0a5e8d6cd6b1d7e7dbe94b03ec14ab03a56d637f Mon Sep 17 00:00:00 2001 From: jodevak Date: Tue, 17 Oct 2017 13:00:40 +0300 Subject: [PATCH 06/20] Fixing Indentation --- gensim/models/word2vec.py | 115 +++++++++++++++----------------------- 1 file changed, 45 insertions(+), 70 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index d4d92aed56..7e52c060ec 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -114,8 +114,8 @@ except ImportError: from Queue import Queue, Empty -from numpy import exp, log, dot, zeros, outer, 
random, dtype, float32 as REAL, \ - uint32, seterr, array, uint8, vstack, fromstring, sqrt, \ +from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL,\ + uint32, seterr, array, uint8, vstack, fromstring, sqrt,\ empty, sum as np_sum, ones, logaddexp from scipy.special import expit @@ -136,7 +136,6 @@ FAST_VERSION = -1 MAX_WORDS_IN_BATCH = 10000 - def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): """ Update skip-gram model by training on a sequence of sentences. @@ -151,7 +150,7 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): result = 0 for sentence in sentences: word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] + model.wv.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code @@ -167,7 +166,6 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): result += len(word_vocabs) return result - def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False): """ Update CBOW model by training on a sequence of sentences. @@ -182,7 +180,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss result = 0 for sentence in sentences: word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] + model.wv.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code start = max(0, pos - model.window + reduced_window) @@ -195,7 +193,6 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss result += len(word_vocabs) return result - def score_sentence_sg(model, sentence, work=None): """ Obtain likelihood score for a single sentence in a fitted skip-gram representaion. @@ -225,7 +222,6 @@ def score_sentence_sg(model, sentence, work=None): return log_prob_sentence - def score_sentence_cbow(model, sentence, work=None, neu1=None): """ Obtain likelihood score for a single sentence in a fitted CBOW representaion. 
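The `sample_int > model.random.rand() * 2**32` test in these hunks treats a keep-probability as a 32-bit integer threshold; a small self-contained illustration with a made-up probability:

import random

keep_probability = 0.16                        # made-up value for a very frequent word
sample_int = int(round(keep_probability * 2**32))

draws = 100000
kept = sum(sample_int > random.random() * 2**32 for _ in range(draws))
print(sample_int, kept / float(draws))         # empirical keep rate comes out near 0.16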
@@ -299,7 +295,7 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h # loss component corresponding to hierarchical softmax if compute_loss: - sgn = (-1.0) ** predict_word.code # `ch` function, 0 -> 1, 1 -> -1 + sgn = (-1.0)**predict_word.code # `ch` function, 0 -> 1, 1 -> -1 lprob = -log(expit(-sgn * prod_term)) model.running_training_loss += sum(lprob) @@ -333,8 +329,7 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h return neu1e -def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, - compute_loss=False, +def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, compute_loss=False, context_vectors=None, context_locks=None, is_ft=False): if context_vectors is None: if is_ft: @@ -362,7 +357,7 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr # loss component corresponding to hierarchical softmax if compute_loss: - sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 model.running_training_loss += sum(-log(expit(-sgn * prod_term))) if model.negative: @@ -406,14 +401,14 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr def score_sg_pair(model, word, word2): l1 = model.wv.syn0[word2.index] l2a = deepcopy(model.syn1[word.point]) # 2d matrix, codelen x layer1_size - sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) return sum(lprob) def score_cbow_pair(model, word, l1): l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size - sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) return sum(lprob) @@ -545,8 +540,7 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, if isinstance(sentences, GeneratorType): raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.") self.build_vocab(sentences, trim_rule=trim_rule) - self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, start_alpha=self.alpha, - end_alpha=self.min_alpha) + self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, start_alpha=self.alpha, end_alpha=self.min_alpha) else: if trim_rule is not None: logger.warning( @@ -558,7 +552,7 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, def initialize_word_vectors(self): self.wv = KeyedVectors() - def make_cum_table(self, power=0.75, domain=2 ** 31 - 1): + def make_cum_table(self, power=0.75, domain=2**31 - 1): """ Create a cumulative-distribution table using stored vocabulary word counts for drawing random words in the negative-sampling training routines. 
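A minimal standalone sketch of the cumulative-table idea behind this method (an illustration, not gensim's code): raise the counts to `power`, scale the running total into an integer `domain`, then draw negative samples by binary search.

import bisect
import random

def build_cum_table(counts, power=0.75, domain=2**31 - 1):
    """counts: word frequencies, listed in index2word order."""
    z = sum(c ** power for c in counts)         # normalising constant (Z in the paper)
    cum_table, cumulative = [], 0.0
    for c in counts:
        cumulative += c ** power
        cum_table.append(int(round(cumulative / z * domain)))
    return cum_table

counts = [100, 50, 10, 1]                       # hypothetical word counts
table = build_cum_table(counts)
assert table[-1] == 2**31 - 1                   # the last entry always equals the domain
negatives = [bisect.bisect_left(table, random.randint(0, table[-1])) for _ in range(5)]
print(table, negatives)                         # frequent words are drawn far more often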
@@ -575,10 +569,10 @@ def make_cum_table(self, power=0.75, domain=2 ** 31 - 1):
         # compute sum of all power (Z in paper)
         train_words_pow = 0.0
         for word_index in xrange(vocab_size):
-            train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count ** power
+            train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count**power
         cumulative = 0.0
         for word_index in xrange(vocab_size):
-            cumulative += self.wv.vocab[self.wv.index2word[word_index]].count ** power
+            cumulative += self.wv.vocab[self.wv.index2word[word_index]].count**power
             self.cum_table[word_index] = round(cumulative / train_words_pow * domain)
         if len(self.cum_table) > 0:
             assert self.cum_table[-1] == domain
@@ -623,8 +617,7 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_
         Each sentence must be a list of unicode strings.

         """
         self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule)  # initial survey
-        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,
-                         update=update)  # trim by min_count & precalculate downsampling
+        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays

     def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False):
@@ -662,8 +655,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
         self.corpus_count = corpus_count if corpus_count else 0
         self.raw_vocab = vocab
-        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,
-                         update=update)  # trim by min_count & precalculate downsampling
+        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays

     def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
@@ -674,22 +666,20 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
         min_reduce = 1
         vocab = defaultdict(int)
         checked_string_types = 0
-
         for sentence_no, sentence in enumerate(sentences):
             if not checked_string_types:
                 if isinstance(sentence, string_types):
-                    logger.warning("Each 'sentences' "
-                                   "item should be a list of words "
-                                   "(usually unicode strings)."
-                                   "First item here is instead plain %s.", type(sentence)
-                                   )
+                    logger.warning(
+                        "Each 'sentences' item should be a list of words (usually unicode strings). "
+                        "First item here is instead plain %s.",
+                        type(sentence)
+                    )
                 checked_string_types += 1
             if sentence_no % progress_per == 0:
-                logger.info("PROGRESS: at sentence #%i,"
-                            " processed %i words, "
-                            "keeping %i word types",
-                            sentence_no, total_words, len(vocab))
-
+                logger.info(
+                    "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
+                    sentence_no, total_words, len(vocab)
+                )
             for word in sentence:
                 vocab[word] += 1
             total_words += 1
@@ -698,11 +688,13 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
             utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
             min_reduce += 1
-        logger.info("collected %i word types from a corpus of %i raw words and %i sentences", len(vocab), total_words,
-                    sentence_no + 1)
+        logger.info(
+            "collected %i word types from a corpus of %i raw words and %i sentences",
+            len(vocab), total_words, sentence_no + 1
+        )
         self.corpus_count = sentence_no + 1
-        self.raw_vocab = vocab
-
+        self.raw_vocab = vocab
+
     def scale_vocab(self, min_count=None, sample=None, dry_run=False,
                     keep_raw_vocab=False, trim_rule=None, update=False):
         """
@@ -809,7 +801,7 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False,
                 word_probability = 1.0
             downsample_total += v
             if not dry_run:
-                self.wv.vocab[w].sample_int = int(round(word_probability * 2 ** 32))
+                self.wv.vocab[w].sample_int = int(round(word_probability * 2**32))

         if not dry_run and not keep_raw_vocab:
             logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab))
@@ -927,8 +919,7 @@ def train(self, sentences, total_examples=None, total_words=None,
         logger.info(
             "training model with %i workers on %i vocabulary and %i features, "
             "using sg=%s hs=%s sample=%s negative=%s window=%s",
-            self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative,
-            self.window
+            self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative, self.window
         )

         if not self.wv.vocab:
@@ -1095,16 +1086,13 @@ def job_producer():
             raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed
         )
         if job_tally < 10 * self.workers:
-            logger.warning(
-                "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay")
+            logger.warning("under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay")

         # check that the input corpus hasn't changed during iteration
         if total_examples and total_examples != example_count:
-            logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count,
-                           total_examples)
+            logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples)
         if total_words and total_words != raw_word_count:
-            logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count,
-                           total_words)
+            logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words)

         self.train_count += 1  # number of times train() has been called
         self.total_train_time += elapsed
@@ -1439,8 +1427,7 @@ def predict_output_word(self, context_words_list, topn=10):
         prob_values = exp(dot(l1, self.syn1neg.T))  # propagate hidden -> output and take softmax to get probabilities
         prob_values /= sum(prob_values)
         top_indices = matutils.argsort(prob_values, topn=topn, reverse=True)
-        return [(self.wv.index2word[index1], prob_values[index1]) for index1 in
-                top_indices]  # returning the most probable output words with their probabilities
+        return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices]  # returning the most probable output words with their probabilities

     def init_sims(self, replace=False):
         """
@@ -1484,8 +1471,7 @@ def log_evaluate_word_pairs(pearson, spearman, oov, pairs):
         """
         return KeyedVectors.log_evaluate_word_pairs(pearson, spearman, oov, pairs)

-    def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True,
-                            dummy4unknown=False):
+    def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
         """
         Deprecated. Use self.wv.evaluate_word_pairs() instead.
         Refer to the documentation for `gensim.models.KeyedVectors.evaluate_word_pairs`
@@ -1493,8 +1479,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case
         return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown)

     def __str__(self):
-        return "%s(vocab=%s, size=%s, alpha=%s)" % (
-            self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha)
+        return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha)

     def _minimize_model(self, save_syn1=False, save_syn1neg=False, save_syn0_lockf=False):
         warnings.warn(
@@ -1544,7 +1529,7 @@ def load(cls, *args, **kwargs):
                 if hasattr(v, 'sample_int'):
                     break  # already 0.12.0+ style int probabilities
                 elif hasattr(v, 'sample_probability'):
-                    v.sample_int = int(round(v.sample_probability * 2 ** 32))
+                    v.sample_int = int(round(v.sample_probability * 2**32))
                     del v.sample_probability
             if not hasattr(model, 'syn0_lockf') and hasattr(model, 'syn0'):
                 model.syn0_lockf = ones(len(model.wv.syn0), dtype=REAL)
@@ -1568,7 +1553,7 @@ def _load_specials(self, *args, **kwargs):

     @classmethod
     def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
-                             limit=None, datatype=REAL):
+                             limit=None, datatype=REAL):
         """Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead."""
         raise DeprecationWarning("Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.")
@@ -1732,7 +1717,6 @@ def __iter__(self):
 # Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3
 if __name__ == "__main__":
     import argparse
-
     logging.basicConfig(
         format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
         level=logging.INFO)
@@ -1754,23 +1738,14 @@ def __iter__(self):
     parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors")
     parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5)
     parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100)
-    parser.add_argument("-sample",
-                        help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)",
-                        type=float, default=1e-3)
-    parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0,
-                        choices=[0, 1])
-    parser.add_argument("-negative",
-                        help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)",
-                        type=int, default=5)
+    parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3)
+    parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1])
+    parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5)
     parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12)
    parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5)
-    parser.add_argument("-min_count",
-                        help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int,
-                        default=5)
-    parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)",
-                        type=int, default=1, choices=[0, 1])
-    parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int,
-                        default=0, choices=[0, 1])
+    parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5)
+    parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1])
+    parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1])
     parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model")
     args = parser.parse_args()

From 644fcada795d87043d213026fab3c6e609022c67 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Tue, 17 Oct 2017 13:57:53 +0300
Subject: [PATCH 07/20] Fixing gensim/models/word2vec.py:697:1: W293 blank line contains whitespace

---
 gensim/models/word2vec.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 7e52c060ec..756660a19e 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -693,8 +693,8 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
             len(vocab), total_words, sentence_no + 1
         )
         self.corpus_count = sentence_no + 1
-        self.raw_vocab = vocab
-
+        self.raw_vocab = vocab
+
     def scale_vocab(self, min_count=None, sample=None, dry_run=False,
                     keep_raw_vocab=False, trim_rule=None, update=False):
         """

From c91b4cb157e48ca253bb89596ed640d92b91e916 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Tue, 17 Oct 2017 14:50:26 +0300
Subject: [PATCH 08/20] Remove trailing white spaces

---
 gensim/models/word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 756660a19e..66a242e288 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -693,7 +693,7 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
             len(vocab), total_words, sentence_no + 1
         )
         self.corpus_count = sentence_no + 1
-        self.raw_vocab = vocab
+        self.raw_vocab = vocab

     def scale_vocab(self, min_count=None, sample=None, dry_run=False,
                     keep_raw_vocab=False, trim_rule=None, update=False):

From 1e4ef3ee9cb99ac4b175518269bbe4977ab09bc1 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Wed, 18 Oct 2017 22:59:28 +0300
Subject: [PATCH 09/20] Adding test

---
 gensim/test/test_word2vec.py | 49 +++++++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 81123ccd7a..85a7855412 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -84,6 +84,53 @@ def load_on_instance():


 class TestWord2VecModel(unittest.TestCase):
+    def testBuildVocabFromFreq(self):
+        """Test that the algorithm is able to build vocabulary from given
+        frequency table"""
+        freq_dict={
+            'minors': 2, 'graph': 3, 'system': 4,
+            'trees': 3, 'eps': 2, 'computer': 2,
+            'survey': 2, 'user': 3, 'human': 2,
+            'time': 2, 'interface': 2, 'response': 2
+        }
+        model_hs = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=1, negative=0)
+        model_neg = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=0, negative=5)
+        model_hs.build_vocab_from_freq(freq_dict)
+        model_neg.build_vocab_from_freq(freq_dict)
+        self.assertTrue(len(model_hs.wv.vocab), 12)
+        self.assertTrue(len(model_neg.wv.vocab), 12)
+        self.assertEqual(model_hs.wv.vocab['minors'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
+        self.assertEqual(model_hs.wv.vocab['system'].count, 4)
+        self.assertEqual(model_hs.wv.vocab['trees'].count, 3)
+        self.assertEqual(model_hs.wv.vocab['eps'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['computer'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['survey'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['user'].count, 3)
+        self.assertEqual(model_hs.wv.vocab['human'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['time'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['interface'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['response'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['minors'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['graph'].count, 3)
+        self.assertEqual(model_neg.wv.vocab['system'].count, 4)
+        self.assertEqual(model_neg.wv.vocab['trees'].count, 3)
+        self.assertEqual(model_neg.wv.vocab['eps'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['computer'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['survey'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['user'].count, 3)
+        self.assertEqual(model_neg.wv.vocab['human'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['time'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['interface'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['response'].count, 2)
+        new_freq_dict={'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1}
+        model_hs.build_vocab_from_freq(new_freq_dict,update=True)
+        model_neg.build_vocab_from_freq(new_freq_dict,update=True)
+        self.assertTrue(model_hs.wv.vocab['graph'].count, 4)
+        self.assertTrue(model_hs.wv.vocab['artificial'].count, 4)
+        self.assertEqual(len(model_hs.wv.vocab), 14)
+        self.assertEqual(len(model_neg.wv.vocab), 14)
+
     def testOnlineLearning(self):
         """Test that the algorithm is able to add new words to the vocabulary and
         to a trained model when using a sorted vocabulary"""
@@ -831,4 +878,4 @@ def assertLess(self, a, b, msg=None):
         level=logging.DEBUG
     )
     logging.info("using optimization %s", word2vec.FAST_VERSION)
-    unittest.main()
+    unittest.main()
\ No newline at end of file

From 9ae7a84c946e9758ff45cdaaf7f16e7487105824 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Wed, 18 Oct 2017 23:34:43 +0300
Subject: [PATCH 10/20] fix spaces

---
 gensim/test/test_word2vec.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 85a7855412..20fb26ce4b 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -87,7 +87,7 @@ class TestWord2VecModel(unittest.TestCase):
     def testBuildVocabFromFreq(self):
         """Test that the algorithm is able to build vocabulary from given
         frequency table"""
-        freq_dict={
+        freq_dict = {
             'minors': 2, 'graph': 3, 'system': 4,
             'trees': 3, 'eps': 2, 'computer': 2,
             'survey': 2, 'user': 3, 'human': 2,
             'time': 2, 'interface': 2, 'response': 2
@@ -123,9 +123,9 @@ def testBuildVocabFromFreq(self):
         self.assertEqual(model_neg.wv.vocab['time'].count, 2)
         self.assertEqual(model_neg.wv.vocab['interface'].count, 2)
         self.assertEqual(model_neg.wv.vocab['response'].count, 2)
-        new_freq_dict={'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1}
-        model_hs.build_vocab_from_freq(new_freq_dict,update=True)
-        model_neg.build_vocab_from_freq(new_freq_dict,update=True)
+        new_freq_dict = {'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1}
+        model_hs.build_vocab_from_freq(new_freq_dict, update=True)
+        model_neg.build_vocab_from_freq(new_freq_dict, update=True)
         self.assertTrue(model_hs.wv.vocab['graph'].count, 4)
         self.assertTrue(model_hs.wv.vocab['artificial'].count, 4)
         self.assertEqual(len(model_hs.wv.vocab), 14)
         self.assertEqual(len(model_neg.wv.vocab), 14)
@@ -878,4 +878,4 @@ def assertLess(self, a, b, msg=None):
         level=logging.DEBUG
     )
     logging.info("using optimization %s", word2vec.FAST_VERSION)
-    unittest.main()
\ No newline at end of file
+    unittest.main()

From 1e82811cc1a4afa3cb07a86daacef5b631ffd5d3 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Mon, 6 Nov 2017 12:43:00 +0200
Subject: [PATCH 11/20] iteration 2 on code

---
 gensim/models/word2vec.py    | 16 ++++++++++------
 gensim/test/test_word2vec.py | 26 ++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 66a242e288..ab4521de63 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -647,13 +647,17 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No

         Examples
         --------
-        >>> build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
+        >>> model.build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
         """
         logger.info("Processing provided word frequencies")
-        vocab = defaultdict(int, word_freq)
+        raw_vocab = word_freq #Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) to be directly the raw vocab
+        logger.info(
+            "collected %i different raw word, with total frequency of %i",
+            len(raw_vocab), sum(itervalues(raw_vocab))
+        )

-        self.corpus_count = corpus_count if corpus_count else 0
-        self.raw_vocab = vocab
+        self.corpus_count = corpus_count if corpus_count else 0 #Since no sentences are provided, this is to control the corpus_count
+        self.raw_vocab = raw_vocab

         self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays
@@ -675,14 +679,14 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
                         type(sentence)
                     )
                 checked_string_types += 1
-            if sentence_no % progress_per == 0:
+            if sentence_no % progress_per == 0 and sentence_no != 0:
                 logger.info(
                     "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
                     sentence_no, total_words, len(vocab)
                 )
             for word in sentence:
                 vocab[word] += 1
-            total_words += 1
+            total_words += len(sentence)

             if self.max_vocab_size and len(vocab) > self.max_vocab_size:
                 utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 20fb26ce4b..45aee7366f 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -131,6 +131,32 @@ def testBuildVocabFromFreq(self):
         self.assertEqual(len(model_hs.wv.vocab), 14)
         self.assertEqual(len(model_neg.wv.vocab), 14)

+    def testPruneVocab(self):
+        """Test Prune vocab while scanning sentences"""
+        sentences = [
+            ["graph", "system"],
+            ["graph", "system"],
+            ["system", "eps"],
+            ["graph", "system"]
+        ]
+        model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
+        self.assertTrue(len(model.wv.vocab), 2)
+        self.assertEqual(model.wv.vocab['graph'].count, 3)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+
+        sentences = [
+            ["graph", "system"],
+            ["graph", "system"],
+            ["system", "eps"],
+            ["graph", "system"],
+            ["minors", "survey", "minors", "survey", "minors"]
+        ]
+        model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
+        self.assertTrue(len(model.wv.vocab), 3)
+        self.assertEqual(model.wv.vocab['graph'].count, 3)
+        self.assertEqual(model.wv.vocab['minors'].count, 3)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+
     def testOnlineLearning(self):
         """Test that the algorithm is able to add new words to the vocabulary and
         to a trained model when using a sorted vocabulary"""

From aa9227d7714906dee1524d91e6022370a2812b24 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Mon, 6 Nov 2017 12:50:51 +0200
Subject: [PATCH 12/20] iteration 2 on code

---
 gensim/models/word2vec.py    | 56 +++++++++++++++++-------------------
 gensim/test/test_word2vec.py | 30 +++++++++++++++++--
 2 files changed, 55 insertions(+), 31 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 39a7219433..ab4521de63 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -647,13 +647,17 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No

         Examples
         --------
-        >>> build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
+        >>> model.build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
         """
         logger.info("Processing provided word frequencies")
-        vocab = defaultdict(int, word_freq)
+        raw_vocab = word_freq #Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) to be directly the raw vocab
+        logger.info(
+            "collected %i different raw word, with total frequency of %i",
+            len(raw_vocab), sum(itervalues(raw_vocab))
+        )

-        self.corpus_count = corpus_count if corpus_count else 0
-        self.raw_vocab = vocab
+        self.corpus_count = corpus_count if corpus_count else 0 #Since no sentences are provided, this is to control the corpus_count
+        self.raw_vocab = raw_vocab

         self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays
@@ -675,14 +679,14 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
                         type(sentence)
                     )
                 checked_string_types += 1
-            if sentence_no % progress_per == 0:
+            if sentence_no % progress_per == 0 and sentence_no != 0:
                 logger.info(
                     "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
                     sentence_no, total_words, len(vocab)
                 )
             for word in sentence:
                 vocab[word] += 1
-            total_words += 1
+            total_words += len(sentence)

             if self.max_vocab_size and len(vocab) > self.max_vocab_size:
                 utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
@@ -1112,10 +1116,10 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor
         Note that you should specify total_sentences; we'll run into problems if you ask to score more than this number of sentences but it is inefficient to set the value too high.

-        See the article by [#taddy]_ and the gensim demo at [#deepir]_ for examples of how to use such scores in document classification.
+        See the article by [taddy]_ and the gensim demo at [deepir]_ for examples of how to use such scores in document classification.

-        .. [#taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics.
-        .. [#deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb
+        .. [taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics.
+        .. [deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb

         """
         if FAST_VERSION < 0:
@@ -1625,7 +1629,7 @@ class LineSentence(object):
     def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         """
         `source` can be either a string or a file object. Clip the file to the first
-        `limit` lines (or not clipped if limit is None, the default).
+        `limit` lines (or no clipped if limit is None, the default).

         Example::
@@ -1666,20 +1670,15 @@ def __iter__(self):

 class PathLineSentences(object):
     """
-
-    Works like word2vec.LineSentence, but will process all files in a directory in alphabetical order by filename.
-    The directory can only contain files that can be read by LineSentence: .bz2, .gz, and text files. Any file not ending
-    with .bz2 or .gz is assumed to be a text file. Does not work with subdirectories.
-
-    The format of files (either text, or compressed text files) in the path is one sentence = one line, with words already
-    preprocessed and separated by whitespace.
-
+    Simple format: one sentence = one line; words already preprocessed and separated by whitespace.
+    Like LineSentence, but will process all files in a directory in alphabetical order by filename
     """

     def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         """
         `source` should be a path to a directory (as a string) where all files can be opened by the
-        LineSentence class. Each file will be read up to `limit` lines (or not clipped if limit is None, the default).
+        LineSentence class. Each file will be read up to
+        `limit` lines (or no clipped if limit is None, the default).

         Example::
@@ -1693,23 +1692,23 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         self.limit = limit

         if os.path.isfile(self.source):
-            logger.debug('single file given as source, rather than a directory of files')
-            logger.debug('consider using models.word2vec.LineSentence for a single file')
+            logging.warning('single file read, better to use models.word2vec.LineSentence')
             self.input_files = [self.source]  # force code compatibility with list of files
         elif os.path.isdir(self.source):
             self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
-            logger.info('reading directory %s', self.source)
+            logging.debug('reading directory %s', self.source)
             self.input_files = os.listdir(self.source)
-            self.input_files = [self.source + filename for filename in self.input_files]  # make full paths
+            self.input_files = [self.source + file for file in self.input_files]  # make full paths
             self.input_files.sort()  # makes sure it happens in filename order
         else:  # not a file or a directory, then we can't do anything with it
             raise ValueError('input is neither a file nor a path')
-        logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))
+
+        logging.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))

     def __iter__(self):
         """iterate through the files"""
         for file_name in self.input_files:
-            logger.info('reading file %s', file_name)
+            logging.info('reading file %s', file_name)
             with utils.smart_open(file_name) as fin:
                 for line in itertools.islice(fin, self.limit):
                     line = utils.to_unicode(line).split()
@@ -1724,10 +1723,9 @@ def __iter__(self):
     import argparse
     logging.basicConfig(
         format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
-        level=logging.INFO
-    )
-    logger.info("running %s", " ".join(sys.argv))
-    logger.info("using optimization %s", FAST_VERSION)
+        level=logging.INFO)
+    logging.info("running %s", " ".join(sys.argv))
+    logging.info("using optimization %s", FAST_VERSION)

     # check and process cmdline input
     program = os.path.basename(sys.argv[0])

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 4c642ce5d2..45aee7366f 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -131,6 +131,32 @@ def testBuildVocabFromFreq(self):
         self.assertEqual(len(model_hs.wv.vocab), 14)
         self.assertEqual(len(model_neg.wv.vocab), 14)

+    def testPruneVocab(self):
+        """Test Prune vocab while scanning sentences"""
+        sentences = [
+            ["graph", "system"],
+            ["graph", "system"],
+            ["system", "eps"],
+            ["graph", "system"]
+        ]
+        model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
+        self.assertTrue(len(model.wv.vocab), 2)
+        self.assertEqual(model.wv.vocab['graph'].count, 3)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+
+        sentences = [
+            ["graph", "system"],
+            ["graph", "system"],
+            ["system", "eps"],
+            ["graph", "system"],
+            ["minors", "survey", "minors", "survey", "minors"]
+        ]
+        model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
+        self.assertTrue(len(model.wv.vocab), 3)
+        self.assertEqual(model.wv.vocab['graph'].count, 3)
+        self.assertEqual(model.wv.vocab['minors'].count, 3)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+
     def testOnlineLearning(self):
         """Test that the algorithm is able to add new words to the vocabulary and
         to a trained model when using a sorted vocabulary"""
@@ -291,11 +317,11 @@ def testPersistenceWord2VecFormat(self):
         self.assertFalse(np.allclose(model['human'], norm_only_model['human']))
         self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human']))
         limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, limit=3)
-        self.assertEqual(len(limited_model_kv.syn0), 3)
+        self.assertEquals(len(limited_model_kv.syn0), 3)
         half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(
             testfile(), binary=True, datatype=np.float16
         )
-        self.assertEqual(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)
+        self.assertEquals(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)

     def testNoTrainingCFormat(self):
         model = word2vec.Word2Vec(sentences, min_count=1)

From 2066a2afb1b5044729e41e662c2c790d792623c9 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Mon, 6 Nov 2017 15:24:49 +0200
Subject: [PATCH 13/20] Fixing old version of word2vec.py merge problems

---
 gensim/models/word2vec.py    | 47 +++++++++++++++++++++---------------
 gensim/test/test_word2vec.py | 16 ++++++------
 2 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index ab4521de63..1c72a86099 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -650,16 +650,17 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
         >>> model.build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
         """
         logger.info("Processing provided word frequencies")
-        raw_vocab = word_freq #Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) to be directly the raw vocab
+        raw_vocab = word_freq  # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) to be directly the raw vocab
         logger.info(
             "collected %i different raw word, with total frequency of %i",
             len(raw_vocab), sum(itervalues(raw_vocab))
         )

-        self.corpus_count = corpus_count if corpus_count else 0 #Since no sentences are provided, this is to control the corpus_count
+        self.corpus_count = corpus_count if corpus_count else 0  # Since no sentences are provided, this is to control the corpus_count
         self.raw_vocab = raw_vocab

-        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
+        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,
+                         update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays
@@ -1116,10 +1117,10 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor
         Note that you should specify total_sentences; we'll run into problems if you ask to score more than this number of sentences but it is inefficient to set the value too high.

-        See the article by [taddy]_ and the gensim demo at [deepir]_ for examples of how to use such scores in document classification.
+        See the article by [#taddy]_ and the gensim demo at [#deepir]_ for examples of how to use such scores in document classification.

-        .. [taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics.
-        .. [deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb
+        .. [#taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics.
+        .. [#deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb

         """
         if FAST_VERSION < 0:
@@ -1629,7 +1630,7 @@ class LineSentence(object):
     def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         """
         `source` can be either a string or a file object. Clip the file to the first
-        `limit` lines (or no clipped if limit is None, the default).
+        `limit` lines (or not clipped if limit is None, the default).

         Example::
@@ -1670,15 +1671,20 @@ def __iter__(self):

 class PathLineSentences(object):
     """
-    Simple format: one sentence = one line; words already preprocessed and separated by whitespace.
-    Like LineSentence, but will process all files in a directory in alphabetical order by filename
+
+    Works like word2vec.LineSentence, but will process all files in a directory in alphabetical order by filename.
+    The directory can only contain files that can be read by LineSentence: .bz2, .gz, and text files. Any file not ending
+    with .bz2 or .gz is assumed to be a text file. Does not work with subdirectories.
+
+    The format of files (either text, or compressed text files) in the path is one sentence = one line, with words already
+    preprocessed and separated by whitespace.
+
     """

     def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         """
         `source` should be a path to a directory (as a string) where all files can be opened by the
-        LineSentence class. Each file will be read up to
-        `limit` lines (or no clipped if limit is None, the default).
+        LineSentence class. Each file will be read up to `limit` lines (or not clipped if limit is None, the default).

         Example::
@@ -1692,23 +1698,23 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         self.limit = limit

         if os.path.isfile(self.source):
-            logging.warning('single file read, better to use models.word2vec.LineSentence')
+            logger.debug('single file given as source, rather than a directory of files')
+            logger.debug('consider using models.word2vec.LineSentence for a single file')
             self.input_files = [self.source]  # force code compatibility with list of files
         elif os.path.isdir(self.source):
             self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
-            logging.debug('reading directory %s', self.source)
+            logger.info('reading directory %s', self.source)
             self.input_files = os.listdir(self.source)
-            self.input_files = [self.source + file for file in self.input_files]  # make full paths
+            self.input_files = [self.source + filename for filename in self.input_files]  # make full paths
             self.input_files.sort()  # makes sure it happens in filename order
         else:  # not a file or a directory, then we can't do anything with it
             raise ValueError('input is neither a file nor a path')
-
-        logging.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))
+        logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))

     def __iter__(self):
         """iterate through the files"""
         for file_name in self.input_files:
-            logging.info('reading file %s', file_name)
+            logger.info('reading file %s', file_name)
             with utils.smart_open(file_name) as fin:
                 for line in itertools.islice(fin, self.limit):
                     line = utils.to_unicode(line).split()
@@ -1723,9 +1729,10 @@ def __iter__(self):
     import argparse
     logging.basicConfig(
         format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
-        level=logging.INFO)
-    logging.info("running %s", " ".join(sys.argv))
-    logging.info("using optimization %s", FAST_VERSION)
+        level=logging.INFO
+    )
+    logger.info("running %s", " ".join(sys.argv))
+    logger.info("using optimization %s", FAST_VERSION)

     # check and process cmdline input
     program = os.path.basename(sys.argv[0])

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 45aee7366f..9da2bb3d15 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -97,8 +97,8 @@ def testBuildVocabFromFreq(self):
         model_neg = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=0, negative=5)
         model_hs.build_vocab_from_freq(freq_dict)
         model_neg.build_vocab_from_freq(freq_dict)
-        self.assertTrue(len(model_hs.wv.vocab), 12)
-        self.assertTrue(len(model_neg.wv.vocab), 12)
+        self.assertEqual(len(model_hs.wv.vocab), 12)
+        self.assertEqual(len(model_neg.wv.vocab), 12)
         self.assertEqual(model_hs.wv.vocab['minors'].count, 2)
         self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
         self.assertEqual(model_hs.wv.vocab['system'].count, 4)
@@ -126,8 +126,8 @@ def testBuildVocabFromFreq(self):
         new_freq_dict = {'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1}
         model_hs.build_vocab_from_freq(new_freq_dict, update=True)
         model_neg.build_vocab_from_freq(new_freq_dict, update=True)
-        self.assertTrue(model_hs.wv.vocab['graph'].count, 4)
-        self.assertTrue(model_hs.wv.vocab['artificial'].count, 4)
+        self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
+        self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
         self.assertEqual(len(model_hs.wv.vocab), 14)
         self.assertEqual(len(model_neg.wv.vocab), 14)
@@ -140,7 +140,7 @@ def testPruneVocab(self):
             ["graph", "system"]
         ]
         model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
-        self.assertTrue(len(model.wv.vocab), 2)
+        self.assertEqual(len(model.wv.vocab), 2)
         self.assertEqual(model.wv.vocab['graph'].count, 3)
         self.assertEqual(model.wv.vocab['system'].count, 4)
@@ -152,7 +152,7 @@ def testPruneVocab(self):
             ["minors", "survey", "minors", "survey", "minors"]
         ]
         model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
-        self.assertTrue(len(model.wv.vocab), 3)
+        self.assertEqual(len(model.wv.vocab), 3)
         self.assertEqual(model.wv.vocab['graph'].count, 3)
         self.assertEqual(model.wv.vocab['minors'].count, 3)
         self.assertEqual(model.wv.vocab['system'].count, 4)
@@ -317,11 +317,11 @@ def testPersistenceWord2VecFormat(self):
         self.assertFalse(np.allclose(model['human'], norm_only_model['human']))
         self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human']))
         limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, limit=3)
-        self.assertEquals(len(limited_model_kv.syn0), 3)
+        self.assertEqual(len(limited_model_kv.syn0), 3)
         half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(
             testfile(), binary=True, datatype=np.float16
         )
-        self.assertEquals(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)
+        self.assertEqual(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)

     def testNoTrainingCFormat(self):
         model = word2vec.Word2Vec(sentences, min_count=1)

From 62ed129291ff5b1892e93f09bd700de22669f76a Mon Sep 17 00:00:00 2001
From: jodevak
Date: Mon, 6 Nov 2017 15:34:26 +0200
Subject: [PATCH 14/20] Fixing indent

---
 gensim/models/word2vec.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 1c72a86099..c3209e745d 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -659,8 +659,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
         self.corpus_count = corpus_count if corpus_count else 0  # Since no sentences are provided, this is to control the corpus_count
         self.raw_vocab = raw_vocab

-        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,
-                         update=update)  # trim by min_count & precalculate downsampling
+        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays

From 473d7e6a77330a84c62916f4b0b7398fbefc90ef Mon Sep 17 00:00:00 2001
From: jodevak
Date: Mon, 6 Nov 2017 16:37:36 +0200
Subject: [PATCH 15/20] Fixing Styling

---
 gensim/models/word2vec.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index c3209e745d..45bf6f90ae 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -647,7 +647,9 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No

         Examples
         --------
-        >>> model.build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
+        >>> from gensim.models.word2vec import Word2Vec
+        >>> model=Word2Vec()
+        >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20})
         """
         logger.info("Processing provided word frequencies")
         raw_vocab = word_freq  # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) to be directly the raw vocab
@@ -659,7 +661,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
         self.corpus_count = corpus_count if corpus_count else 0  # Since no sentences are provided, this is to control the corpus_count
         self.raw_vocab = raw_vocab

-        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,update=update)  # trim by min_count & precalculate downsampling
+        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays

From a65e36bc0fddcc39322b4b266226f6db241bb4e4 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Mon, 6 Nov 2017 17:15:34 +0200
Subject: [PATCH 16/20] Fixing Styling

---
 gensim/models/word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 45bf6f90ae..46c45e2abe 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -648,7 +648,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
         Examples
         --------
         >>> from gensim.models.word2vec import Word2Vec
-        >>> model=Word2Vec()
+        >>> model= Word2Vec()
         >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20})
         """
         logger.info("Processing provided word frequencies")

From 7f46a051a1027243872ccc9ea34e5b444e87b457 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Mon, 6 Nov 2017 18:24:30 +0200
Subject: [PATCH 17/20] test

---
 gensim/models/word2vec.py    | 1 +
 gensim/test/test_word2vec.py | 7 +++++++
 2 files changed, 8 insertions(+)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 46c45e2abe..751b1ce4c4 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -701,6 +701,7 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
         self.corpus_count = sentence_no + 1
         self.raw_vocab = vocab
+

     def scale_vocab(self, min_count=None, sample=None, dry_run=False,
                     keep_raw_vocab=False, trim_rule=None, update=False):
         """

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 9da2bb3d15..c11299f6d2 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -21,6 +21,7 @@
 from gensim import utils
 from gensim.models import word2vec, keyedvectors
 from testfixtures import log_capture
+from six import itervalues

 try:
     from pyemd import emd  # noqa:F401
@@ -157,6 +158,12 @@ def testPruneVocab(self):
         self.assertEqual(model.wv.vocab['minors'].count, 3)
         self.assertEqual(model.wv.vocab['system'].count, 4)

+    def testTotalWordCount(self):
+        model = word2vec.Word2Vec(size=10, min_count=0, seed=42)
+        model.build_vocab(sentences, keep_raw_vocab=True)
+        total_words = sum(itervalues(model.raw_vocab))
+        self.assertEqual(total_words, 29)
+
     def testOnlineLearning(self):
         """Test that the algorithm is able to add new words to the vocabulary and
         to a trained model when using a sorted vocabulary"""

From f744c4f199d8753bee0fe4f67c1a6a5fc2be8167 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Mon, 6 Nov 2017 18:25:13 +0200
Subject: [PATCH 18/20] test

---
 gensim/models/word2vec.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 751b1ce4c4..46c45e2abe 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -701,7 +701,6 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
         self.corpus_count = sentence_no + 1
         self.raw_vocab = vocab
-

     def scale_vocab(self, min_count=None, sample=None, dry_run=False,
                     keep_raw_vocab=False, trim_rule=None, update=False):
         """

From 64711642d65c5a6f95165e05631863b4120dfa0b Mon Sep 17 00:00:00 2001
From: jodevak
Date: Tue, 7 Nov 2017 11:59:10 +0200
Subject: [PATCH 19/20] adding total words count test

---
 gensim/models/word2vec.py    | 1 +
 gensim/test/test_word2vec.py | 4 +---
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 46c45e2abe..64b389b4f3 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -700,6 +700,7 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
         )
         self.corpus_count = sentence_no + 1
         self.raw_vocab = vocab
+        return total_words

     def scale_vocab(self, min_count=None, sample=None, dry_run=False,
                     keep_raw_vocab=False, trim_rule=None, update=False):
         """

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index c11299f6d2..242b6d39bd 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -21,7 +21,6 @@
 from gensim import utils
 from gensim.models import word2vec, keyedvectors
 from testfixtures import log_capture
-from six import itervalues

 try:
     from pyemd import emd  # noqa:F401
@@ -160,8 +159,7 @@ def testPruneVocab(self):

     def testTotalWordCount(self):
         model = word2vec.Word2Vec(size=10, min_count=0, seed=42)
-        model.build_vocab(sentences, keep_raw_vocab=True)
-        total_words = sum(itervalues(model.raw_vocab))
+        total_words = model.scan_vocab(sentences)
         self.assertEqual(total_words, 29)

     def testOnlineLearning(self):

From 9bc6b78daa82bdbdfe438c941aa9dbcc4c4efee1 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Tue, 7 Nov 2017 12:02:32 +0200
Subject: [PATCH 20/20] adding total words count test

---
 gensim/models/word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 64b389b4f3..4ca0974a17 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -681,7 +681,7 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
                         type(sentence)
                     )
                 checked_string_types += 1
-            if sentence_no % progress_per == 0 and sentence_no != 0:
+            if sentence_no % progress_per == 0:
                 logger.info(
                     "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
                     sentence_no, total_words, len(vocab)