piskvorky · menshikh-iv · Nov 8, 2017 · Sep 25, 2017 · Sep 25, 2017 · Sep 25, 2017
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
@@ -647,13 +647,17 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
 
         Examples
         --------
-        >>> build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
+        >>> model.build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
         """
         logger.info("Processing provided word frequencies")
-        vocab = defaultdict(int, word_freq)
+        raw_vocab = word_freq #Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) to be directly the raw vocab
+        logger.info(
+            "collected %i different raw word, with total frequency of %i",
+            len(raw_vocab), sum(itervalues(raw_vocab))
+        )
 
-        self.corpus_count = corpus_count if corpus_count else 0
-        self.raw_vocab = vocab
+        self.corpus_count = corpus_count if corpus_count else 0 #Since no sentences are provided, this is to control the corpus_count
+        self.raw_vocab = raw_vocab
 
         self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays
@@ -675,14 +679,14 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
                         type(sentence)
                     )
                 checked_string_types += 1
-            if sentence_no % progress_per == 0:
+            if sentence_no % progress_per == 0 and sentence_no != 0:
                 logger.info(
                     "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
                     sentence_no, total_words, len(vocab)
                 )
             for word in sentence:
                 vocab[word] += 1
-                total_words += 1
+            total_words += len(sentence)
 
             if self.max_vocab_size and len(vocab) > self.max_vocab_size:
                 utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
@@ -1112,10 +1116,10 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor
         Note that you should specify total_sentences; we'll run into problems if you ask to
         score more than this number of sentences but it is inefficient to set the value too high.
 
-        See the article by [#taddy]_ and the gensim demo at [#deepir]_ for examples of how to use such scores in document classification.
+        See the article by [taddy]_ and the gensim demo at [deepir]_ for examples of how to use such scores in document classification.
 
-        .. [#taddy] Taddy, Matt.  Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics.
-        .. [#deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb
+        .. [taddy] Taddy, Matt.  Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics.
+        .. [deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb
 
         """
         if FAST_VERSION < 0:
@@ -1625,7 +1629,7 @@ class LineSentence(object):
     def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         """
         `source` can be either a string or a file object. Clip the file to the first
-        `limit` lines (or not clipped if limit is None, the default).
+        `limit` lines (or not clipped if limit is None, the default).
 
         Example::
 
@@ -1666,20 +1670,15 @@ def __iter__(self):
 
 class PathLineSentences(object):
     """
-
-    Works like word2vec.LineSentence, but will process all files in a directory in alphabetical order by filename.
-    The directory can only contain files that can be read by LineSentence: .bz2, .gz, and text files. Any file not ending
-    with .bz2 or .gz is assumed to be a text file. Does not work with subdirectories.
-
-    The format of files (either text, or compressed text files) in the path is one sentence = one line, with words already
-    preprocessed and separated by whitespace.
-
+    Simple format: one sentence = one line; words already preprocessed and separated by whitespace.
+    Like LineSentence, but will process all files in a directory in alphabetical order by filename
     """
 
     def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         """
         `source` should be a path to a directory (as a string) where all files can be opened by the
-        LineSentence class. Each file will be read up to `limit` lines (or not clipped if limit is None, the default).
+        LineSentence class. Each file will be read up to
+        `limit` lines (or not clipped if limit is None, the default).
 
         Example::
 
@@ -1693,23 +1692,23 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         self.limit = limit
 
         if os.path.isfile(self.source):
-            logger.debug('single file given as source, rather than a directory of files')
-            logger.debug('consider using models.word2vec.LineSentence for a single file')
+            logging.warning('single file read, better to use models.word2vec.LineSentence')
             self.input_files = [self.source]  # force code compatibility with list of files
         elif os.path.isdir(self.source):
             self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
-            logger.info('reading directory %s', self.source)
+            logging.debug('reading directory %s', self.source)
             self.input_files = os.listdir(self.source)
-            self.input_files = [self.source + filename for filename in self.input_files]  # make full paths
+            self.input_files = [self.source + file for file in self.input_files]  # make full paths
             self.input_files.sort()  # makes sure it happens in filename order
         else:  # not a file or a directory, then we can't do anything with it
             raise ValueError('input is neither a file nor a path')
-        logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))
+
+        logging.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))
 
     def __iter__(self):
         """iterate through the files"""
         for file_name in self.input_files:
-            logger.info('reading file %s', file_name)
+            logging.info('reading file %s', file_name)
             with utils.smart_open(file_name) as fin:
                 for line in itertools.islice(fin, self.limit):
                     line = utils.to_unicode(line).split()
@@ -1724,10 +1723,9 @@ def __iter__(self):
     import argparse
     logging.basicConfig(
         format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
-        level=logging.INFO
-    )
-    logger.info("running %s", " ".join(sys.argv))
-    logger.info("using optimization %s", FAST_VERSION)
+        level=logging.INFO)
+    logging.info("running %s", " ".join(sys.argv))
+    logging.info("using optimization %s", FAST_VERSION)
 
     # check and process cmdline input
     program = os.path.basename(sys.argv[0])

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
@@ -131,6 +131,32 @@ def testBuildVocabFromFreq(self):
         self.assertEqual(len(model_hs.wv.vocab), 14)
         self.assertEqual(len(model_neg.wv.vocab), 14)
 
+    def testPruneVocab(self):
+        """Test that the vocabulary is pruned while scanning sentences.
+
+        Both models are built with `max_vocab_size=2` and `min_count=0`, so any
+        word missing from the final vocabulary was dropped by pruning during the
+        scan, not by the `min_count` threshold. Words that survive must keep
+        their full counts.
+        """
+        sentences = [
+            ["graph", "system"],
+            ["graph", "system"],
+            ["system", "eps"],
+            ["graph", "system"]
+        ]
+        model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
+        # assertEqual, not assertTrue: assertTrue(x, 2) treats 2 as the failure
+        # message and passes for any non-empty vocabulary.
+        self.assertEqual(len(model.wv.vocab), 2)  # 'eps' is expected to be pruned
+        self.assertEqual(model.wv.vocab['graph'].count, 3)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+
+        sentences = [
+            ["graph", "system"],
+            ["graph", "system"],
+            ["system", "eps"],
+            ["graph", "system"],
+            ["minors", "survey", "minors", "survey", "minors"]
+        ]
+        model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
+        # 'eps' and 'survey' are expected to be pruned; 'minors' survives with its full count
+        self.assertEqual(len(model.wv.vocab), 3)
+        self.assertEqual(model.wv.vocab['graph'].count, 3)
+        self.assertEqual(model.wv.vocab['minors'].count, 3)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+
     def testOnlineLearning(self):
         """Test that the algorithm is able to add new words to the
         vocabulary and to a trained model when using a sorted vocabulary"""
@@ -291,11 +317,11 @@ def testPersistenceWord2VecFormat(self):
         self.assertFalse(np.allclose(model['human'], norm_only_model['human']))
         self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human']))
         limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, limit=3)
-        self.assertEqual(len(limited_model_kv.syn0), 3)
+        self.assertEquals(len(limited_model_kv.syn0), 3)
         half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(
             testfile(), binary=True, datatype=np.float16
         )
-        self.assertEqual(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)
+        self.assertEquals(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)
 
     def testNoTrainingCFormat(self):
         model = word2vec.Word2Vec(sentences, min_count=1)