# Improving Scan_Vocab speed, `build_vocab_from_freq` function. Iteration 2 (#1695)
Changes from 13 commits.
First changed file (evidently the word2vec module, judging by the functions and classes in the hunks):
```diff
@@ -647,13 +647,17 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No

         Examples
         --------
-        >>> build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
+        >>> model.build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
         """
         logger.info("Processing provided word frequencies")
-        vocab = defaultdict(int, word_freq)
+        raw_vocab = word_freq  # Instead of scanning text, this will assign provided word frequencies dictionary (word_freq) to be directly the raw vocab
+        logger.info(
+            "collected %i different raw word, with total frequency of %i",
+            len(raw_vocab), sum(itervalues(raw_vocab))
+        )

-        self.corpus_count = corpus_count if corpus_count else 0
-        self.raw_vocab = vocab
+        self.corpus_count = corpus_count if corpus_count else 0  # Since no sentences are provided, this is to control the corpus_count
+        self.raw_vocab = raw_vocab

         self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays
```

Review thread on the `>>> model.build_vocab_from_freq(...)` doc example:

- Reviewer: PEP8:
- Author: sorry, what's the problem with this?
- Reviewer: spaces after
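In essence, the new code path skips corpus scanning entirely and takes the caller's frequency dict as the raw vocabulary. A simplified standalone sketch of this hunk's logic; the function name and the `print` stand in for gensim's plumbing and logger, and the `scale_vocab`/`finalize_vocab` steps are omitted:

```python
def build_vocab_from_freq(word_freq, corpus_count=None):
    # Treat the provided frequency dict directly as the raw vocabulary,
    # instead of scanning sentences to build it.
    raw_vocab = word_freq
    total = sum(raw_vocab.values())
    print("collected %i different raw words, total frequency %i" % (len(raw_vocab), total))
    # No sentences are scanned, so corpus_count must come from the caller
    # (falling back to 0, as in the diff).
    return raw_vocab, corpus_count if corpus_count else 0

vocab, count = build_vocab_from_freq({"Word1": 15, "Word2": 20})
```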
```diff
@@ -675,14 +679,14 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
                     type(sentence)
                 )
             checked_string_types += 1
-            if sentence_no % progress_per == 0:
+            if sentence_no % progress_per == 0 and sentence_no != 0:
                 logger.info(
                     "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
                     sentence_no, total_words, len(vocab)
                 )
             for word in sentence:
                 vocab[word] += 1
-                total_words += 1
+            total_words += len(sentence)

             if self.max_vocab_size and len(vocab) > self.max_vocab_size:
                 utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
```

Review thread on the added `and sentence_no != 0` condition:

- Reviewer: Why is this needed?
- Author: Because 0 % anything equals 0, so the logger would log a statement saying sentence 0 and processed 0.
- Reviewer: But we want that :)
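The `total_words` change is the actual speed optimization named in the PR title: one addition per sentence instead of one increment per word. A standalone sketch of the before/after counting strategies; the function names are illustrative, not gensim API:

```python
from collections import defaultdict

def count_words_per_word(sentences):
    # Old behaviour: increment the running total once for every word.
    vocab, total_words = defaultdict(int), 0
    for sentence in sentences:
        for word in sentence:
            vocab[word] += 1
            total_words += 1  # executed once per word
    return vocab, total_words

def count_words_per_sentence(sentences):
    # New behaviour: a single O(1) addition per sentence.
    vocab, total_words = defaultdict(int), 0
    for sentence in sentences:
        for word in sentence:
            vocab[word] += 1
        total_words += len(sentence)  # hoisted out of the inner loop
    return vocab, total_words

sentences = [["graph", "system"], ["system", "eps"]]
assert count_words_per_word(sentences) == count_words_per_sentence(sentences)
```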
```diff
@@ -1112,10 +1116,10 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor
         Note that you should specify total_sentences; we'll run into problems if you ask to
         score more than this number of sentences but it is inefficient to set the value too high.

-        See the article by [#taddy]_ and the gensim demo at [#deepir]_ for examples of how to use such scores in document classification.
+        See the article by [taddy]_ and the gensim demo at [deepir]_ for examples of how to use such scores in document classification.

-        .. [#taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics.
-        .. [#deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb
+        .. [taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics.
+        .. [deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb

         """
         if FAST_VERSION < 0:
```

Review thread on the changed footnote labels (in reStructuredText, `[#name]_` is an auto-numbered footnote reference while `[name]_` is a citation reference, so dropping the `#` changes how Sphinx renders and numbers them):

- Reviewer: I'm sorry, but why are you removing the `#`?
- Author: the autopep8 tool did it
- Author: will fix
- Author: this file is merged with an older version.
```diff
@@ -1625,7 +1629,7 @@ class LineSentence(object):
     def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         """
         `source` can be either a string or a file object. Clip the file to the first
-        `limit` lines (or not clipped if limit is None, the default).
+        `limit` lines (or no clipped if limit is None, the default).

         Example::

```
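The docstring's `Example::` block is collapsed in this view; as a hedged illustration, typical `LineSentence` usage looks like the following (the file path is illustrative):

```python
from gensim.models.word2vec import LineSentence

# Stream a whitespace-delimited, one-sentence-per-line text file,
# clipped to the first 100 lines via `limit`.
for sentence in LineSentence('corpus.txt', limit=100):
    print(sentence)  # each sentence is a list of unicode tokens
```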
```diff
@@ -1666,20 +1670,15 @@ def __iter__(self):

 class PathLineSentences(object):
     """
-
-    Works like word2vec.LineSentence, but will process all files in a directory in alphabetical order by filename.
-    The directory can only contain files that can be read by LineSentence: .bz2, .gz, and text files. Any file not ending
-    with .bz2 or .gz is assumed to be a text file. Does not work with subdirectories.
-
-    The format of files (either text, or compressed text files) in the path is one sentence = one line, with words already
-    preprocessed and separated by whitespace.
-
+    Simple format: one sentence = one line; words already preprocessed and separated by whitespace.
+    Like LineSentence, but will process all files in a directory in alphabetical order by filename
     """

     def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         """
         `source` should be a path to a directory (as a string) where all files can be opened by the
-        LineSentence class. Each file will be read up to `limit` lines (or not clipped if limit is None, the default).
+        LineSentence class. Each file will be read up to
+        `limit` lines (or no clipped if limit is None, the default).

         Example::

```
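Similarly, a hedged usage sketch for `PathLineSentences`, mirroring its docstring (the directory path is illustrative):

```python
from gensim.models.word2vec import PathLineSentences

# Process every readable file (.bz2, .gz, or plain text) in the
# directory, in alphabetical filename order; subdirectories are
# not supported.
for sentence in PathLineSentences('/data/corpus_dir'):
    print(sentence)  # a list of preprocessed, whitespace-split tokens
```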
```diff
@@ -1693,23 +1692,23 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         self.limit = limit

         if os.path.isfile(self.source):
-            logger.debug('single file given as source, rather than a directory of files')
-            logger.debug('consider using models.word2vec.LineSentence for a single file')
+            logging.warning('single file read, better to use models.word2vec.LineSentence')
             self.input_files = [self.source]  # force code compatibility with list of files
         elif os.path.isdir(self.source):
             self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
-            logger.info('reading directory %s', self.source)
+            logging.debug('reading directory %s', self.source)
             self.input_files = os.listdir(self.source)
-            self.input_files = [self.source + filename for filename in self.input_files]  # make full paths
+            self.input_files = [self.source + file for file in self.input_files]  # make full paths
             self.input_files.sort()  # makes sure it happens in filename order
         else:  # not a file or a directory, then we can't do anything with it
             raise ValueError('input is neither a file nor a path')
-        logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))

+        logging.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))

     def __iter__(self):
         """iterate through the files"""
         for file_name in self.input_files:
-            logger.info('reading file %s', file_name)
+            logging.info('reading file %s', file_name)
             with utils.smart_open(file_name) as fin:
                 for line in itertools.islice(fin, self.limit):
                     line = utils.to_unicode(line).split()
```
```diff
@@ -1724,10 +1723,9 @@ def __iter__(self):
     import argparse
     logging.basicConfig(
         format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
-        level=logging.INFO
-    )
-    logger.info("running %s", " ".join(sys.argv))
-    logger.info("using optimization %s", FAST_VERSION)
+        level=logging.INFO)
+    logging.info("running %s", " ".join(sys.argv))
+    logging.info("using optimization %s", FAST_VERSION)

     # check and process cmdline input
     program = os.path.basename(sys.argv[0])
```
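Several of these hunks swap the module-level `logger` for calls on the root `logging` module (flagged above as coming from a merge with an older file version). A minimal sketch of the practical difference:

```python
import logging

# Root-logger call: records are tagged "root", and logging may
# implicitly call basicConfig() on first use.
logging.info("message via the root logger")

# Module-level logger: records carry the module's own name, which
# downstream applications can filter or silence per module.
logger = logging.getLogger(__name__)
logger.info("message via the module logger")
```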
Second changed file (evidently the word2vec test suite, judging by the test names):
```diff
@@ -131,6 +131,32 @@ def testBuildVocabFromFreq(self):
         self.assertEqual(len(model_hs.wv.vocab), 14)
         self.assertEqual(len(model_neg.wv.vocab), 14)

+    def testPruneVocab(self):
+        """Test Prune vocab while scanning sentences"""
+        sentences = [
+            ["graph", "system"],
+            ["graph", "system"],
+            ["system", "eps"],
+            ["graph", "system"]
+        ]
+        model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
+        self.assertTrue(len(model.wv.vocab), 2)
+        self.assertEqual(model.wv.vocab['graph'].count, 3)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+
+        sentences = [
+            ["graph", "system"],
+            ["graph", "system"],
+            ["system", "eps"],
+            ["graph", "system"],
+            ["minors", "survey", "minors", "survey", "minors"]
+        ]
+        model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
+        self.assertTrue(len(model.wv.vocab), 3)
+
+        self.assertEqual(model.wv.vocab['graph'].count, 3)
+        self.assertEqual(model.wv.vocab['minors'].count, 3)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+
     def testOnlineLearning(self):
         """Test that the algorithm is able to add new words to the
         vocabulary and to a trained model when using a sorted vocabulary"""
```

Review thread on the blank line before the final assertions:

- Reviewer: maybe you need
- Author: will fix
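One hedged aside on the new test (not raised in the diff discussion): `assertTrue(len(model.wv.vocab), 2)` passes `2` as the failure *message*, not as an expected value, so it only asserts that the vocab is non-empty. A minimal illustration of the difference:

```python
import unittest

class VocabSizeTest(unittest.TestCase):
    def test_sizes(self):
        vocab = {'graph': 3, 'system': 4}
        # assertTrue(x, msg) treats the second argument as the failure
        # message, so this passes for ANY non-empty vocab:
        self.assertTrue(len(vocab), 2)
        # An equality assertion actually compares the length:
        self.assertEqual(len(vocab), 2)

if __name__ == '__main__':
    unittest.main()
```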
```diff
@@ -291,11 +317,11 @@ def testPersistenceWord2VecFormat(self):
         self.assertFalse(np.allclose(model['human'], norm_only_model['human']))
         self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human']))
         limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, limit=3)
-        self.assertEqual(len(limited_model_kv.syn0), 3)
+        self.assertEquals(len(limited_model_kv.syn0), 3)
         half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(
             testfile(), binary=True, datatype=np.float16
         )
-        self.assertEqual(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)
+        self.assertEquals(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)

     def testNoTrainingCFormat(self):
         model = word2vec.Word2Vec(sentences, min_count=1)
```

Review thread on the `assertEquals` lines:

- Reviewer: https://docs.python.org/2/library/unittest.html#deprecated-aliases (`assertEquals` is a deprecated alias of `assertEqual`)
- Author: alright
Review thread on the `build_vocab_from_freq` doc example:

- Reviewer: Model is undefined, please create the model first (the docstring should be executable, i.e. I can copy-paste this code into a console and expect it to run successfully); we plan to add doctests to our CI soon.
- Author: 👍
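Taking the reviewer's point, a copy-paste-runnable version of the docstring example would instantiate the model before seeding its vocabulary. A hedged sketch, assuming the gensim API of this PR's era; the parameter values are illustrative:

```python
from gensim.models import word2vec

# Create the model first, with no corpus attached yet.
model = word2vec.Word2Vec(size=10, min_count=1)

# Then seed the vocabulary from precomputed word frequencies.
model.build_vocab_from_freq({"Word1": 15, "Word2": 20})

print(len(model.wv.vocab))  # expected: 2
```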