From 3f30e1e711c108b9ef2e2ae336ab4838280eb256 Mon Sep 17 00:00:00 2001 From: jodevak Date: Mon, 25 Sep 2017 17:47:04 +0300 Subject: [PATCH 01/20] fix build vocab speed issue, and new function to build vocab from previously provided word frequencies table --- gensim/models/word2vec.py | 391 ++++++++++++++++++-------------------- 1 file changed, 180 insertions(+), 211 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index ce7de6330c..fea1cdc990 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -114,16 +114,18 @@ except ImportError: from Queue import Queue, Empty -from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL,\ - uint32, seterr, array, uint8, vstack, fromstring, sqrt,\ - empty, sum as np_sum, ones, logaddexp +from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL, \ + double, uint32, seterr, array, uint8, vstack, fromstring, sqrt, newaxis, \ + ndarray, empty, sum as np_sum, prod, ones, ascontiguousarray, vstack, logaddexp from scipy.special import expit from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc +from gensim.corpora.dictionary import Dictionary from six import iteritems, itervalues, string_types from six.moves import xrange from types import GeneratorType +from scipy import stats logger = logging.getLogger(__name__) @@ -136,6 +138,7 @@ FAST_VERSION = -1 MAX_WORDS_IN_BATCH = 10000 + def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): """ Update skip-gram model by training on a sequence of sentences. @@ -150,7 +153,7 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): result = 0 for sentence in sentences: word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2**32] + model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code @@ -159,13 +162,13 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): # don't train on the `word` itself if pos2 != pos: - train_sg_pair( - model, model.wv.index2word[word.index], word2.index, alpha, compute_loss=compute_loss - ) + train_sg_pair(model, model.wv.index2word[word.index], word2.index, alpha, + compute_loss=compute_loss) result += len(word_vocabs) return result + def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False): """ Update CBOW model by training on a sequence of sentences. @@ -180,7 +183,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss result = 0 for sentence in sentences: word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2**32] + model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code start = max(0, pos - model.window + reduced_window) @@ -193,6 +196,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss result += len(word_vocabs) return result + def score_sentence_sg(model, sentence, work=None): """ Obtain likelihood score for a single sentence in a fitted skip-gram representaion. 
@@ -222,6 +226,7 @@ def score_sentence_sg(model, sentence, work=None): return log_prob_sentence + def score_sentence_cbow(model, sentence, work=None, neu1=None): """ Obtain likelihood score for a single sentence in a fitted CBOW representaion. @@ -254,32 +259,18 @@ def score_sentence_cbow(model, sentence, work=None, neu1=None): def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_hidden=True, - context_vectors=None, context_locks=None, compute_loss=False, is_ft=False): + context_vectors=None, context_locks=None, compute_loss=False): if context_vectors is None: - if is_ft: - context_vectors_vocab = model.wv.syn0_vocab - context_vectors_ngrams = model.wv.syn0_ngrams - else: - context_vectors = model.wv.syn0 + context_vectors = model.wv.syn0 if context_locks is None: - if is_ft: - context_locks_vocab = model.syn0_vocab_lockf - context_locks_ngrams = model.syn0_ngrams_lockf - else: - context_locks = model.syn0_lockf + context_locks = model.syn0_lockf if word not in model.wv.vocab: return predict_word = model.wv.vocab[word] # target word (NN output) - if is_ft: - l1_vocab = context_vectors_vocab[context_index[0]] - l1_ngrams = np_sum(context_vectors_ngrams[context_index[1:]], axis=0) - if context_index: - l1 = np_sum([l1_vocab, l1_ngrams], axis=0) / len(context_index) - else: - l1 = context_vectors[context_index] # input word (NN input/projection layer) - lock_factor = context_locks[context_index] + l1 = context_vectors[context_index] # input word (NN input/projection layer) + lock_factor = context_locks[context_index] neu1e = zeros(l1.shape) @@ -295,7 +286,7 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h # loss component corresponding to hierarchical softmax if compute_loss: - sgn = (-1.0)**predict_word.code # `ch` function, 0 -> 1, 1 -> -1 + sgn = (-1.0) ** predict_word.code # `ch` function, 0 -> 1, 1 -> -1 lprob = -log(expit(-sgn * prod_term)) model.running_training_loss += sum(lprob) @@ -320,30 +311,12 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h model.running_training_loss -= log(expit(prod_term[0])) # for the output word if learn_vectors: - if is_ft: - model.wv.syn0_vocab[context_index[0]] += neu1e * context_locks_vocab[context_index[0]] - for i in context_index[1:]: - model.wv.syn0_ngrams[i] += neu1e * context_locks_ngrams[i] - else: - l1 += neu1e * lock_factor # learn input -> hidden (mutates model.wv.syn0[word2.index], if that is l1) + l1 += neu1e * lock_factor # learn input -> hidden (mutates model.wv.syn0[word2.index], if that is l1) return neu1e -def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, compute_loss=False, - context_vectors=None, context_locks=None, is_ft=False): - if context_vectors is None: - if is_ft: - context_vectors_vocab = model.wv.syn0_vocab - context_vectors_ngrams = model.wv.syn0_ngrams - else: - context_vectors = model.wv.syn0 - if context_locks is None: - if is_ft: - context_locks_vocab = model.syn0_vocab_lockf - context_locks_ngrams = model.syn0_ngrams_lockf - else: - context_locks = model.syn0_lockf - +def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, + compute_loss=False): neu1e = zeros(l1.shape) if model.hs: @@ -357,7 +330,7 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr # loss component corresponding to hierarchical softmax if compute_loss: - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0) ** 
word.code # ch function, 0-> 1, 1 -> -1 model.running_training_loss += sum(-log(expit(-sgn * prod_term))) if model.negative: @@ -382,18 +355,10 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr if learn_vectors: # learn input -> hidden, here for all words in the window separately - if is_ft: - if not model.cbow_mean and input_word_indices: - neu1e /= (len(input_word_indices[0]) + len(input_word_indices[1])) - for i in input_word_indices[0]: - context_vectors_vocab[i] += neu1e * context_locks_vocab[i] - for i in input_word_indices[1]: - context_vectors_ngrams[i] += neu1e * context_locks_ngrams[i] - else: - if not model.cbow_mean and input_word_indices: - neu1e /= len(input_word_indices) - for i in input_word_indices: - context_vectors[i] += neu1e * context_locks[i] + if not model.cbow_mean and input_word_indices: + neu1e /= len(input_word_indices) + for i in input_word_indices: + model.wv.syn0[i] += neu1e * model.syn0_lockf[i] return neu1e @@ -401,14 +366,14 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr def score_sg_pair(model, word, word2): l1 = model.wv.syn0[word2.index] l2a = deepcopy(model.syn1[word.point]) # 2d matrix, codelen x layer1_size - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) return sum(lprob) def score_cbow_pair(model, word, l1): l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) return sum(lprob) @@ -425,10 +390,11 @@ class Word2Vec(utils.SaveLoad): """ - def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False): + def __init__( + self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, + max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, + sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False): """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. @@ -502,9 +468,9 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, self.load = call_on_class_only if FAST_VERSION == -1: - logger.warning('Slow version of %s is being used', __name__) + logger.warning('Slow version of {0} is being used'.format(__name__)) else: - logger.debug('Fast version of %s is being used', __name__) + logger.debug('Fast version of {0} is being used'.format(__name__)) self.initialize_word_vectors() self.sg = int(sg) @@ -540,19 +506,18 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, if isinstance(sentences, GeneratorType): raise TypeError("You can't pass a generator as the sentences argument. 
Try an iterator.") self.build_vocab(sentences, trim_rule=trim_rule) - self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, start_alpha=self.alpha, end_alpha=self.min_alpha) + self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, + start_alpha=self.alpha, end_alpha=self.min_alpha) else: if trim_rule is not None: logger.warning( - "The rule, if given, is only used to prune vocabulary during build_vocab() " - "and is not stored as part of the model. Model initialized without sentences. " - "trim_rule provided, if any, will be ignored." - ) + "The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. ") + logger.warning("Model initialized without sentences. trim_rule provided, if any, will be ignored.") def initialize_word_vectors(self): self.wv = KeyedVectors() - def make_cum_table(self, power=0.75, domain=2**31 - 1): + def make_cum_table(self, power=0.75, domain=2 ** 31 - 1): """ Create a cumulative-distribution table using stored vocabulary word counts for drawing random words in the negative-sampling training routines. @@ -569,10 +534,10 @@ def make_cum_table(self, power=0.75, domain=2**31 - 1): # compute sum of all power (Z in paper) train_words_pow = 0.0 for word_index in xrange(vocab_size): - train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count**power + train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count ** power cumulative = 0.0 for word_index in xrange(vocab_size): - cumulative += self.wv.vocab[self.wv.index2word[word_index]].count**power + cumulative += self.wv.vocab[self.wv.index2word[word_index]].count ** power self.cum_table[word_index] = round(cumulative / train_words_pow * domain) if len(self.cum_table) > 0: assert self.cum_table[-1] == domain @@ -590,9 +555,8 @@ def create_binary_tree(self): heapq.heapify(heap) for i in xrange(len(self.wv.vocab) - 1): min1, min2 = heapq.heappop(heap), heapq.heappop(heap) - heapq.heappush( - heap, Vocab(count=min1.count + min2.count, index=i + len(self.wv.vocab), left=min1, right=min2) - ) + heapq.heappush(heap, + Vocab(count=min1.count + min2.count, index=i + len(self.wv.vocab), left=min1, right=min2)) # recurse over the tree, assigning a binary code to each vocabulary word if heap: @@ -612,16 +576,31 @@ def create_binary_tree(self): logger.info("built huffman tree with maximum node depth %i", max_depth) def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False): + """ Build vocabulary from a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of unicode strings. 
- """ self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling self.finalize_vocab(update=update) # build tables & arrays + def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): + + """ + Build vocabulary from a dictionary of word frequencies + """ + logger.info("Processing provided word frequencies") + vocab = defaultdict(int, word_freq) + + self.corpus_count = corpus_count if corpus_count else 0 + self.raw_vocab = vocab + + self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling + self.finalize_vocab(update=update) # build tables & arrays + def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): + """Do an initial scan of all words appearing in sentences.""" logger.info("collecting all words and their counts") sentence_no = -1 @@ -629,37 +608,33 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): min_reduce = 1 vocab = defaultdict(int) checked_string_types = 0 + for sentence_no, sentence in enumerate(sentences): if not checked_string_types: if isinstance(sentence, string_types): logger.warning( - "Each 'sentences' item should be a list of words (usually unicode strings). " - "First item here is instead plain %s.", - type(sentence) + "Each 'sentences' item should be a list of words (usually unicode strings)." + "First item here is instead plain %s.", type(sentence) ) checked_string_types += 1 if sentence_no % progress_per == 0: - logger.info( - "PROGRESS: at sentence #%i, processed %i words, keeping %i word types", - sentence_no, sum(itervalues(vocab)) + total_words, len(vocab) - ) + logger.info("PROGRESS: at sentence #%i, processed %i words, keeping %i word types", + sentence_no, total_words, len(vocab)) + for word in sentence: vocab[word] += 1 + total_words += 1 if self.max_vocab_size and len(vocab) > self.max_vocab_size: - total_words += utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) + utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) min_reduce += 1 - total_words += sum(itervalues(vocab)) - logger.info( - "collected %i word types from a corpus of %i raw words and %i sentences", - len(vocab), total_words, sentence_no + 1 - ) + logger.info("collected %i word types from a corpus of %i raw words and %i sentences", len(vocab), total_words, sentence_no + 1) self.corpus_count = sentence_no + 1 self.raw_vocab = vocab - def scale_vocab(self, min_count=None, sample=None, dry_run=False, - keep_raw_vocab=False, trim_rule=None, update=False): + def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab=False, trim_rule=None, + update=False): """ Apply vocabulary settings for `min_count` (discarding less-frequent words) and `sample` (controlling the downsampling of more-frequent words). 
@@ -700,16 +675,12 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, drop_total += v original_unique_total = len(retain_words) + drop_unique retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) - logger.info( - "min_count=%d retains %i unique words (%i%% of original %i, drops %i)", - min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique - ) + logger.info("min_count=%d retains %i unique words (%i%% of original %i, drops %i)", + min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique) original_total = retain_total + drop_total retain_pct = retain_total * 100 / max(original_total, 1) - logger.info( - "min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", - min_count, retain_total, retain_pct, original_total, drop_total - ) + logger.info("min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", + min_count, retain_total, retain_pct, original_total, drop_total) else: logger.info("Updating model with new vocabulary") new_total = pre_exist_total = 0 @@ -733,12 +704,10 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1) new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1) - logger.info( - "New added %i unique words (%i%% of original %i) " - "and increased the count of %i pre-existing words (%i%% of original %i)", - len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words), - pre_exist_unique_pct, original_unique_total - ) + logger.info("""New added %i unique words (%i%% of original %i) + and increased the count of %i pre-existing words (%i%% of original %i)""", + len(new_words), new_unique_pct, original_unique_total, + len(pre_exist_words), pre_exist_unique_pct, original_unique_total) retain_words = new_words + pre_exist_words retain_total = new_total + pre_exist_total @@ -764,23 +733,22 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, word_probability = 1.0 downsample_total += v if not dry_run: - self.wv.vocab[w].sample_int = int(round(word_probability * 2**32)) + self.wv.vocab[w].sample_int = int(round(word_probability * 2 ** 32)) if not dry_run and not keep_raw_vocab: logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab)) self.raw_vocab = defaultdict(int) logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique) - logger.info( - "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", - downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total - ) + logger.info("downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", + downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total) - # return from each step: words-affected, resulting-corpus-size, extra memory estimates - report_values = { - 'drop_unique': drop_unique, 'retain_total': retain_total, 'downsample_unique': downsample_unique, - 'downsample_total': int(downsample_total), 'memory': self.estimate_memory(vocab_size=len(retain_words)) - } + # return from each step: words-affected, resulting-corpus-size + report_values = {'drop_unique': drop_unique, 'retain_total': retain_total, + 'downsample_unique': downsample_unique, 'downsample_total': int(downsample_total)} + + # print extra memory estimates + report_values['memory'] = self.estimate_memory(vocab_size=len(retain_words)) 
return report_values @@ -846,7 +814,8 @@ def _raw_word_count(self, job): return sum(len(sentence) for sentence in job) def train(self, sentences, total_examples=None, total_words=None, - epochs=None, start_alpha=None, end_alpha=None, word_count=0, + epochs=None, start_alpha=None, end_alpha=None, + word_count=0, queue_factor=2, report_delay=1.0, compute_loss=None): """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). @@ -862,13 +831,11 @@ def train(self, sentences, total_examples=None, total_words=None, explicit `epochs` argument MUST be provided. In the common and recommended case, where `train()` is only called once, the model's cached `iter` value should be supplied as `epochs` value. """ - if self.model_trimmed_post_training: + if (self.model_trimmed_post_training): raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method") if FAST_VERSION < 0: - warnings.warn( - "C extension not loaded for Word2Vec, training will be slow. " - "Install a C compiler and reinstall gensim for fast training." - ) + warnings.warn("C extension not loaded for Word2Vec, training will be slow. " + "Install a C compiler and reinstall gensim for fast training.") self.neg_labels = [] if self.negative > 0: # precompute negative labels optimization for pure-python training @@ -882,8 +849,8 @@ def train(self, sentences, total_examples=None, total_words=None, logger.info( "training model with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s negative=%s window=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative, self.window - ) + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, + self.hs, self.sample, self.negative, self.window) if not self.wv.vocab: raise RuntimeError("you must first build vocabulary before training the model") @@ -894,15 +861,11 @@ def train(self, sentences, total_examples=None, total_words=None, raise ValueError( "The number of sentences in the training corpus is missing. Did you load the model via KeyedVectors.load_word2vec_format?" "Models loaded via load_word2vec_format don't support further training. " - "Instead start with a blank model, scan_vocab on the new corpus, " - "intersect_word2vec_format with the old model, then train." - ) + "Instead start with a blank model, scan_vocab on the new corpus, intersect_word2vec_format with the old model, then train.") if total_words is None and total_examples is None: raise ValueError( - "You must specify either total_examples or total_words, for proper alpha and progress calculations. " - "The usual value is total_examples=model.corpus_count." - ) + "You must specify either total_examples or total_words, for proper alpha and progress calculations. The usual value is total_examples=model.corpus_count.") if epochs is None: raise ValueError("You must specify an explict epochs count. 
The usual value is epochs=model.iter.") start_alpha = start_alpha or self.alpha @@ -937,7 +900,9 @@ def job_producer(): pushed_words, pushed_examples = 0, 0 next_alpha = start_alpha if next_alpha > self.min_alpha_yet_reached: - logger.warning("Effective 'alpha' higher than previous training cycles") + logger.warning( + "Effective 'alpha' higher than previous training cycles" + ) self.min_alpha_yet_reached = next_alpha job_no = 0 @@ -953,8 +918,7 @@ def job_producer(): # no => submit the existing job logger.debug( "queueing job #%i (%i words, %i sentences) at alpha %.05f", - job_no, batch_size, len(job_batch), next_alpha - ) + job_no, batch_size, len(job_batch), next_alpha) job_no += 1 job_queue.put((job_batch, next_alpha)) @@ -978,15 +942,15 @@ def job_producer(): if job_batch: logger.debug( "queueing job #%i (%i words, %i sentences) at alpha %.05f", - job_no, batch_size, len(job_batch), next_alpha - ) + job_no, batch_size, len(job_batch), next_alpha) job_no += 1 job_queue.put((job_batch, next_alpha)) if job_no == 0 and self.train_count == 0: logger.warning( "train() called with an empty iterator (if not intended, " - "be sure to provide a corpus that offers restartable iteration = an iterable)." + "be sure to provide a corpus that offers restartable " + "iteration = an iterable)." ) # give the workers heads up that they can finish -- no more work! @@ -1031,31 +995,34 @@ def job_producer(): logger.info( "PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", 100.0 * example_count / total_examples, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue) - ) + utils.qsize(job_queue), utils.qsize(progress_queue)) else: # words-based progress % logger.info( "PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", 100.0 * raw_word_count / total_words, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue) - ) + utils.qsize(job_queue), utils.qsize(progress_queue)) next_report = elapsed + report_delay # all done; report the final stats elapsed = default_timer() - start logger.info( "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed - ) + raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed) if job_tally < 10 * self.workers: - logger.warning("under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay") + logger.warning( + "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay" + ) # check that the input corpus hasn't changed during iteration if total_examples and total_examples != example_count: - logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples) + logger.warning( + "supplied example count (%i) did not equal expected count (%i)", example_count, total_examples + ) if total_words and total_words != raw_word_count: - logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words) + logger.warning( + "supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words + ) self.train_count += 1 # number of times train() has been called self.total_train_time += elapsed @@ -1082,25 +1049,21 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor """ if FAST_VERSION < 0: - warnings.warn( - "C extension compilation failed, scoring will be slow. 
" - "Install a C compiler and reinstall gensim for fastness." - ) + warnings.warn("C extension compilation failed, scoring will be slow. " + "Install a C compiler and reinstall gensim for fastness.") logger.info( "scoring sentences with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s and negative=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative - ) + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative) if not self.wv.vocab: raise RuntimeError("you must first build vocabulary before scoring new data") if not self.hs: - raise RuntimeError( - "We have currently only implemented score for the hierarchical softmax scheme, " - "so you need to have run word2vec with hs=1 and negative=0 for this to work." - ) + raise RuntimeError("We have currently only implemented score \ + for the hierarchical softmax scheme, so you need to have \ + run word2vec with hs=1 and negative=0 for this to work.") def worker_loop(): """Compute log probability for each sentence, lifting lists of sentences from the jobs queue.""" @@ -1146,14 +1109,15 @@ def worker_loop(): if (job_no - 1) * chunksize > total_sentences: logger.warning( "terminating after %i sentences (set higher total_sentences if you want more).", - total_sentences - ) + total_sentences) job_no -= 1 raise StopIteration() logger.debug("putting job #%i in the queue", job_no) job_queue.put(items) except StopIteration: - logger.info("reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs + 1) + logger.info( + "reached end of input; waiting to finish %i outstanding jobs", + job_no - done_jobs + 1) for _ in xrange(self.workers): job_queue.put(None) # give the workers heads up that they can finish -- no more work! push_done = True @@ -1166,8 +1130,7 @@ def worker_loop(): if elapsed >= next_report: logger.info( "PROGRESS: at %.2f%% sentences, %.0f sentences/s", - 100.0 * sentence_count, sentence_count / elapsed - ) + 100.0 * sentence_count, sentence_count / elapsed) next_report = elapsed + report_delay # don't flood log, wait report_delay seconds else: # loop ended by job count; really done @@ -1179,8 +1142,7 @@ def worker_loop(): self.clear_sims() logger.info( "scoring %i sentences took %.1fs, %.0f sentences/s", - sentence_count, elapsed, sentence_count / elapsed - ) + sentence_count, elapsed, sentence_count / elapsed) return sentence_scores[:sentence_count] def clear_sims(self): @@ -1207,10 +1169,9 @@ def update_weights(self): # Raise an error if an online update is run before initial training on a corpus if not len(self.wv.syn0): - raise RuntimeError( - "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " - "First build the vocabulary of your model with a corpus before doing an online update." - ) + raise RuntimeError("You cannot do an online vocabulary-update of a model which has no prior vocabulary. " \ + "First build the vocabulary of your model with a corpus " \ + "before doing an online update.") self.wv.syn0 = vstack([self.wv.syn0, newsyn0]) @@ -1259,16 +1220,16 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut training. Use 1.0 to allow further training updates of merged vectors. 
""" overlap_count = 0 - logger.info("loading projection weights from %s", fname) + logger.info("loading projection weights from %s" % (fname)) with utils.smart_open(fname) as fin: header = utils.to_unicode(fin.readline(), encoding=encoding) - vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format + vocab_size, vector_size = map(int, header.split()) # throws for invalid file format if not vector_size == self.vector_size: raise ValueError("incompatible vector size %d in file %s" % (vector_size, fname)) # TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)? if binary: binary_len = dtype(REAL).itemsize * vector_size - for _ in xrange(vocab_size): + for line_no in xrange(vocab_size): # mixed text and binary: read text first, then binary word = [] while True: @@ -1287,15 +1248,15 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut for line_no, line in enumerate(fin): parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") if len(parts) != vector_size + 1: - raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) - word, weights = parts[0], [REAL(x) for x in parts[1:]] + raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no)) + word, weights = parts[0], list(map(REAL, parts[1:])) if word in self.wv.vocab: overlap_count += 1 self.wv.syn0[self.wv.vocab[word].index] = weights self.syn0_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0 stops further changes - logger.info("merged %d vectors into %s matrix from %s", overlap_count, self.wv.syn0.shape, fname) + logger.info("merged %d vectors into %s matrix from %s" % (overlap_count, self.wv.syn0.shape, fname)) - def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None): + def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None): """ Deprecated. Use self.wv.most_similar() instead. Refer to the documentation for `gensim.models.KeyedVectors.most_similar` @@ -1309,7 +1270,7 @@ def wmdistance(self, document1, document2): """ return self.wv.wmdistance(document1, document2) - def most_similar_cosmul(self, positive=None, negative=None, topn=10): + def most_similar_cosmul(self, positive=[], negative=[], topn=10): """ Deprecated. Use self.wv.most_similar_cosmul() instead. Refer to the documentation for `gensim.models.KeyedVectors.most_similar_cosmul` @@ -1368,10 +1329,9 @@ def n_similarity(self, ws1, ws2): def predict_output_word(self, context_words_list, topn=10): """Report the probability distribution of the center word given the context words as input to the trained model.""" if not self.negative: - raise RuntimeError( - "We have currently only implemented predict_output_word for the negative sampling scheme, " - "so you need to have run word2vec with negative > 0 for this to work." 
- ) + raise RuntimeError("We have currently only implemented predict_output_word " + "for the negative sampling scheme, so you need to have " + "run word2vec with negative > 0 for this to work.") if not hasattr(self.wv, 'syn0') or not hasattr(self, 'syn1neg'): raise RuntimeError("Parameters required for predicting the output words not found.") @@ -1390,7 +1350,8 @@ def predict_output_word(self, context_words_list, topn=10): prob_values = exp(dot(l1, self.syn1neg.T)) # propagate hidden -> output and take softmax to get probabilities prob_values /= sum(prob_values) top_indices = matutils.argsort(prob_values, topn=topn, reverse=True) - return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices] # returning the most probable output words with their probabilities + return [(self.wv.index2word[index1], prob_values[index1]) for index1 in + top_indices] # returning the most probable output words with their probabilities def init_sims(self, replace=False): """ @@ -1412,10 +1373,8 @@ def estimate_memory(self, vocab_size=None, report=None): if self.negative: report['syn1neg'] = vocab_size * self.layer1_size * dtype(REAL).itemsize report['total'] = sum(report.values()) - logger.info( - "estimated required memory for %i words and %i dimensions: %i bytes", - vocab_size, self.vector_size, report['total'] - ) + logger.info("estimated required memory for %i words and %i dimensions: %i bytes", + vocab_size, self.vector_size, report['total']) return report @staticmethod @@ -1434,7 +1393,8 @@ def log_evaluate_word_pairs(pearson, spearman, oov, pairs): """ return KeyedVectors.log_evaluate_word_pairs(pearson, spearman, oov, pairs) - def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): + def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, + dummy4unknown=False): """ Deprecated. Use self.wv.evaluate_word_pairs() instead. Refer to the documentation for `gensim.models.KeyedVectors.evaluate_word_pairs` @@ -1442,14 +1402,12 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown) def __str__(self): - return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha) + return "%s(vocab=%s, size=%s, alpha=%s)" % ( + self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha) def _minimize_model(self, save_syn1=False, save_syn1neg=False, save_syn0_lockf=False): warnings.warn( - "This method would be deprecated in the future. " - "Keep just_word_vectors = model.wv to retain just the KeyedVectors instance " - "for read-only querying of word vectors." - ) + "This method would be deprecated in the future. 
Keep just_word_vectors = model.wv to retain just the KeyedVectors instance for read-only querying of word vectors.") if save_syn1 and save_syn1neg and save_syn0_lockf: return if hasattr(self, 'syn1') and not save_syn1: @@ -1492,7 +1450,7 @@ def load(cls, *args, **kwargs): if hasattr(v, 'sample_int'): break # already 0.12.0+ style int probabilities elif hasattr(v, 'sample_probability'): - v.sample_int = int(round(v.sample_probability * 2**32)) + v.sample_int = int(round(v.sample_probability * 2 ** 32)) del v.sample_probability if not hasattr(model, 'syn0_lockf') and hasattr(model, 'syn0'): model.syn0_lockf = ones(len(model.wv.syn0), dtype=REAL) @@ -1516,7 +1474,7 @@ def _load_specials(self, *args, **kwargs): @classmethod def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', - limit=None, datatype=REAL): + limit=None, datatype=REAL): """Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.""" raise DeprecationWarning("Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.") @@ -1623,7 +1581,7 @@ def __iter__(self): line = utils.to_unicode(line).split() i = 0 while i < len(line): - yield line[i: i + self.max_sentence_length] + yield line[i:i + self.max_sentence_length] i += self.max_sentence_length @@ -1641,7 +1599,7 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): Example:: - sentences = PathLineSentences(os.getcwd() + '\\corpus\\') + sentences = LineSentencePath(os.getcwd() + '\\corpus\\') The files in the directory should be either text files, .bz2 files, or .gz files. @@ -1655,19 +1613,19 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): self.input_files = [self.source] # force code compatibility with list of files elif os.path.isdir(self.source): self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path - logging.debug('reading directory %s', self.source) + logging.debug('reading directory ' + self.source) self.input_files = os.listdir(self.source) self.input_files = [self.source + file for file in self.input_files] # make full paths self.input_files.sort() # makes sure it happens in filename order else: # not a file or a directory, then we can't do anything with it raise ValueError('input is neither a file nor a path') - logging.info('files read into PathLineSentences:%s', '\n'.join(self.input_files)) + logging.info('files read into PathLineSentences:' + '\n'.join(self.input_files)) def __iter__(self): - """iterate through the files""" + '''iterate through the files''' for file_name in self.input_files: - logging.info('reading file %s', file_name) + logging.info('reading file ' + file_name) with utils.smart_open(file_name) as fin: for line in itertools.islice(fin, self.limit): line = utils.to_unicode(line).split() @@ -1680,6 +1638,7 @@ def __iter__(self): # Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3 if __name__ == "__main__": import argparse + logging.basicConfig( format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) @@ -1692,7 +1651,7 @@ def __iter__(self): print(globals()['__doc__'] % locals()) sys.exit(1) - from gensim.models.word2vec import Word2Vec # noqa:F811 avoid referencing __main__ in pickle + from gensim.models.word2vec import Word2Vec # avoid referencing __main__ in pickle seterr(all='raise') # don't ignore numpy errors @@ -1701,14 +1660,23 @@ def __iter__(self): 
parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors") parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5) parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100) - parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3) - parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1]) - parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5) + parser.add_argument("-sample", + help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", + type=float, default=1e-3) + parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, + choices=[0, 1]) + parser.add_argument("-negative", + help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", + type=int, default=5) parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12) parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5) - parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5) - parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1]) - parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1]) + parser.add_argument("-min_count", + help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, + default=5) + parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", + type=int, default=1, choices=[0, 1]) + parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, + default=0, choices=[0, 1]) parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model") args = parser.parse_args() @@ -1723,8 +1691,7 @@ def __iter__(self): model = Word2Vec( corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window, sample=args.sample, sg=skipgram, hs=args.hs, - negative=args.negative, cbow_mean=1, iter=args.iter - ) + negative=args.negative, cbow_mean=1, iter=args.iter) if args.output: outfile = args.output @@ -1741,3 +1708,5 @@ def __iter__(self): model.accuracy(args.accuracy) logger.info("finished running %s", program) + + From c4f387eddf6ebe6f20c1b31f2a301115b23b71b7 Mon Sep 17 00:00:00 2001 From: jodevak Date: Mon, 25 Sep 2017 18:11:44 +0300 Subject: [PATCH 02/20] fix build vocab speed issue, function build vocab from previously provided word frequencies table --- gensim/models/word2vec.py | 376 +++++++++++++++++++++----------------- 1 file changed, 212 insertions(+), 164 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index fea1cdc990..2e6eb89cb2 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ 
-114,18 +114,16 @@ except ImportError: from Queue import Queue, Empty -from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL, \ - double, uint32, seterr, array, uint8, vstack, fromstring, sqrt, newaxis, \ - ndarray, empty, sum as np_sum, prod, ones, ascontiguousarray, vstack, logaddexp +from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL,\ + uint32, seterr, array, uint8, vstack, fromstring, sqrt,\ + empty, sum as np_sum, ones, logaddexp from scipy.special import expit from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc -from gensim.corpora.dictionary import Dictionary from six import iteritems, itervalues, string_types from six.moves import xrange from types import GeneratorType -from scipy import stats logger = logging.getLogger(__name__) @@ -138,7 +136,6 @@ FAST_VERSION = -1 MAX_WORDS_IN_BATCH = 10000 - def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): """ Update skip-gram model by training on a sequence of sentences. @@ -153,7 +150,7 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): result = 0 for sentence in sentences: word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] + model.wv.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code @@ -162,13 +159,13 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): # don't train on the `word` itself if pos2 != pos: - train_sg_pair(model, model.wv.index2word[word.index], word2.index, alpha, - compute_loss=compute_loss) + train_sg_pair( + model, model.wv.index2word[word.index], word2.index, alpha, compute_loss=compute_loss + ) result += len(word_vocabs) return result - def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False): """ Update CBOW model by training on a sequence of sentences. @@ -183,7 +180,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss result = 0 for sentence in sentences: word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] + model.wv.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code start = max(0, pos - model.window + reduced_window) @@ -196,7 +193,6 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss result += len(word_vocabs) return result - def score_sentence_sg(model, sentence, work=None): """ Obtain likelihood score for a single sentence in a fitted skip-gram representaion. @@ -226,7 +222,6 @@ def score_sentence_sg(model, sentence, work=None): return log_prob_sentence - def score_sentence_cbow(model, sentence, work=None, neu1=None): """ Obtain likelihood score for a single sentence in a fitted CBOW representaion. 
@@ -259,18 +254,32 @@ def score_sentence_cbow(model, sentence, work=None, neu1=None): def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_hidden=True, - context_vectors=None, context_locks=None, compute_loss=False): + context_vectors=None, context_locks=None, compute_loss=False, is_ft=False): if context_vectors is None: - context_vectors = model.wv.syn0 + if is_ft: + context_vectors_vocab = model.wv.syn0_vocab + context_vectors_ngrams = model.wv.syn0_ngrams + else: + context_vectors = model.wv.syn0 if context_locks is None: - context_locks = model.syn0_lockf + if is_ft: + context_locks_vocab = model.syn0_vocab_lockf + context_locks_ngrams = model.syn0_ngrams_lockf + else: + context_locks = model.syn0_lockf if word not in model.wv.vocab: return predict_word = model.wv.vocab[word] # target word (NN output) - l1 = context_vectors[context_index] # input word (NN input/projection layer) - lock_factor = context_locks[context_index] + if is_ft: + l1_vocab = context_vectors_vocab[context_index[0]] + l1_ngrams = np_sum(context_vectors_ngrams[context_index[1:]], axis=0) + if context_index: + l1 = np_sum([l1_vocab, l1_ngrams], axis=0) / len(context_index) + else: + l1 = context_vectors[context_index] # input word (NN input/projection layer) + lock_factor = context_locks[context_index] neu1e = zeros(l1.shape) @@ -286,7 +295,7 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h # loss component corresponding to hierarchical softmax if compute_loss: - sgn = (-1.0) ** predict_word.code # `ch` function, 0 -> 1, 1 -> -1 + sgn = (-1.0)**predict_word.code # `ch` function, 0 -> 1, 1 -> -1 lprob = -log(expit(-sgn * prod_term)) model.running_training_loss += sum(lprob) @@ -311,12 +320,30 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h model.running_training_loss -= log(expit(prod_term[0])) # for the output word if learn_vectors: - l1 += neu1e * lock_factor # learn input -> hidden (mutates model.wv.syn0[word2.index], if that is l1) + if is_ft: + model.wv.syn0_vocab[context_index[0]] += neu1e * context_locks_vocab[context_index[0]] + for i in context_index[1:]: + model.wv.syn0_ngrams[i] += neu1e * context_locks_ngrams[i] + else: + l1 += neu1e * lock_factor # learn input -> hidden (mutates model.wv.syn0[word2.index], if that is l1) return neu1e -def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, - compute_loss=False): +def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, compute_loss=False, + context_vectors=None, context_locks=None, is_ft=False): + if context_vectors is None: + if is_ft: + context_vectors_vocab = model.wv.syn0_vocab + context_vectors_ngrams = model.wv.syn0_ngrams + else: + context_vectors = model.wv.syn0 + if context_locks is None: + if is_ft: + context_locks_vocab = model.syn0_vocab_lockf + context_locks_ngrams = model.syn0_ngrams_lockf + else: + context_locks = model.syn0_lockf + neu1e = zeros(l1.shape) if model.hs: @@ -330,7 +357,7 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr # loss component corresponding to hierarchical softmax if compute_loss: - sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 model.running_training_loss += sum(-log(expit(-sgn * prod_term))) if model.negative: @@ -355,10 +382,18 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr if learn_vectors: # 
learn input -> hidden, here for all words in the window separately - if not model.cbow_mean and input_word_indices: - neu1e /= len(input_word_indices) - for i in input_word_indices: - model.wv.syn0[i] += neu1e * model.syn0_lockf[i] + if is_ft: + if not model.cbow_mean and input_word_indices: + neu1e /= (len(input_word_indices[0]) + len(input_word_indices[1])) + for i in input_word_indices[0]: + context_vectors_vocab[i] += neu1e * context_locks_vocab[i] + for i in input_word_indices[1]: + context_vectors_ngrams[i] += neu1e * context_locks_ngrams[i] + else: + if not model.cbow_mean and input_word_indices: + neu1e /= len(input_word_indices) + for i in input_word_indices: + context_vectors[i] += neu1e * context_locks[i] return neu1e @@ -366,14 +401,14 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr def score_sg_pair(model, word, word2): l1 = model.wv.syn0[word2.index] l2a = deepcopy(model.syn1[word.point]) # 2d matrix, codelen x layer1_size - sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) return sum(lprob) def score_cbow_pair(model, word, l1): l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size - sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) return sum(lprob) @@ -390,11 +425,10 @@ class Word2Vec(utils.SaveLoad): """ - def __init__( - self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False): + def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, + max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, + sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False): """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. @@ -468,9 +502,9 @@ def __init__( self.load = call_on_class_only if FAST_VERSION == -1: - logger.warning('Slow version of {0} is being used'.format(__name__)) + logger.warning('Slow version of %s is being used', __name__) else: - logger.debug('Fast version of {0} is being used'.format(__name__)) + logger.debug('Fast version of %s is being used', __name__) self.initialize_word_vectors() self.sg = int(sg) @@ -506,18 +540,19 @@ def __init__( if isinstance(sentences, GeneratorType): raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.") self.build_vocab(sentences, trim_rule=trim_rule) - self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, - start_alpha=self.alpha, end_alpha=self.min_alpha) + self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, start_alpha=self.alpha, end_alpha=self.min_alpha) else: if trim_rule is not None: logger.warning( - "The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. ") - logger.warning("Model initialized without sentences. trim_rule provided, if any, will be ignored.") + "The rule, if given, is only used to prune vocabulary during build_vocab() " + "and is not stored as part of the model. 
Model initialized without sentences. " + "trim_rule provided, if any, will be ignored." + ) def initialize_word_vectors(self): self.wv = KeyedVectors() - def make_cum_table(self, power=0.75, domain=2 ** 31 - 1): + def make_cum_table(self, power=0.75, domain=2**31 - 1): """ Create a cumulative-distribution table using stored vocabulary word counts for drawing random words in the negative-sampling training routines. @@ -534,10 +569,10 @@ def make_cum_table(self, power=0.75, domain=2 ** 31 - 1): # compute sum of all power (Z in paper) train_words_pow = 0.0 for word_index in xrange(vocab_size): - train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count ** power + train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count**power cumulative = 0.0 for word_index in xrange(vocab_size): - cumulative += self.wv.vocab[self.wv.index2word[word_index]].count ** power + cumulative += self.wv.vocab[self.wv.index2word[word_index]].count**power self.cum_table[word_index] = round(cumulative / train_words_pow * domain) if len(self.cum_table) > 0: assert self.cum_table[-1] == domain @@ -555,8 +590,9 @@ def create_binary_tree(self): heapq.heapify(heap) for i in xrange(len(self.wv.vocab) - 1): min1, min2 = heapq.heappop(heap), heapq.heappop(heap) - heapq.heappush(heap, - Vocab(count=min1.count + min2.count, index=i + len(self.wv.vocab), left=min1, right=min2)) + heapq.heappush( + heap, Vocab(count=min1.count + min2.count, index=i + len(self.wv.vocab), left=min1, right=min2) + ) # recurse over the tree, assigning a binary code to each vocabulary word if heap: @@ -585,6 +621,7 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_ self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling self.finalize_vocab(update=update) # build tables & arrays + def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): """ @@ -599,8 +636,8 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling self.finalize_vocab(update=update) # build tables & arrays - def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): + def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): """Do an initial scan of all words appearing in sentences.""" logger.info("collecting all words and their counts") sentence_no = -1 @@ -608,33 +645,37 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): min_reduce = 1 vocab = defaultdict(int) checked_string_types = 0 - for sentence_no, sentence in enumerate(sentences): if not checked_string_types: if isinstance(sentence, string_types): logger.warning( - "Each 'sentences' item should be a list of words (usually unicode strings)." - "First item here is instead plain %s.", type(sentence) + "Each 'sentences' item should be a list of words (usually unicode strings). 
" + "First item here is instead plain %s.", + type(sentence) ) checked_string_types += 1 if sentence_no % progress_per == 0: - logger.info("PROGRESS: at sentence #%i, processed %i words, keeping %i word types", - sentence_no, total_words, len(vocab)) - + logger.info( + "PROGRESS: at sentence #%i, processed %i words, keeping %i word types", + sentence_no, sum(itervalues(vocab)) + total_words, len(vocab) + ) for word in sentence: vocab[word] += 1 - total_words += 1 if self.max_vocab_size and len(vocab) > self.max_vocab_size: - utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) + total_words += utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) min_reduce += 1 - logger.info("collected %i word types from a corpus of %i raw words and %i sentences", len(vocab), total_words, sentence_no + 1) + total_words += sum(itervalues(vocab)) + logger.info( + "collected %i word types from a corpus of %i raw words and %i sentences", + len(vocab), total_words, sentence_no + 1 + ) self.corpus_count = sentence_no + 1 self.raw_vocab = vocab - def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab=False, trim_rule=None, - update=False): + def scale_vocab(self, min_count=None, sample=None, dry_run=False, + keep_raw_vocab=False, trim_rule=None, update=False): """ Apply vocabulary settings for `min_count` (discarding less-frequent words) and `sample` (controlling the downsampling of more-frequent words). @@ -675,12 +716,16 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab drop_total += v original_unique_total = len(retain_words) + drop_unique retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) - logger.info("min_count=%d retains %i unique words (%i%% of original %i, drops %i)", - min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique) + logger.info( + "min_count=%d retains %i unique words (%i%% of original %i, drops %i)", + min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique + ) original_total = retain_total + drop_total retain_pct = retain_total * 100 / max(original_total, 1) - logger.info("min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", - min_count, retain_total, retain_pct, original_total, drop_total) + logger.info( + "min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", + min_count, retain_total, retain_pct, original_total, drop_total + ) else: logger.info("Updating model with new vocabulary") new_total = pre_exist_total = 0 @@ -704,10 +749,12 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1) new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1) - logger.info("""New added %i unique words (%i%% of original %i) - and increased the count of %i pre-existing words (%i%% of original %i)""", - len(new_words), new_unique_pct, original_unique_total, - len(pre_exist_words), pre_exist_unique_pct, original_unique_total) + logger.info( + "New added %i unique words (%i%% of original %i) " + "and increased the count of %i pre-existing words (%i%% of original %i)", + len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words), + pre_exist_unique_pct, original_unique_total + ) retain_words = new_words + pre_exist_words retain_total = new_total + pre_exist_total @@ -733,22 +780,23 @@ def scale_vocab(self, 
min_count=None, sample=None, dry_run=False, keep_raw_vocab word_probability = 1.0 downsample_total += v if not dry_run: - self.wv.vocab[w].sample_int = int(round(word_probability * 2 ** 32)) + self.wv.vocab[w].sample_int = int(round(word_probability * 2**32)) if not dry_run and not keep_raw_vocab: logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab)) self.raw_vocab = defaultdict(int) logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique) - logger.info("downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", - downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total) - - # return from each step: words-affected, resulting-corpus-size - report_values = {'drop_unique': drop_unique, 'retain_total': retain_total, - 'downsample_unique': downsample_unique, 'downsample_total': int(downsample_total)} + logger.info( + "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", + downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total + ) - # print extra memory estimates - report_values['memory'] = self.estimate_memory(vocab_size=len(retain_words)) + # return from each step: words-affected, resulting-corpus-size, extra memory estimates + report_values = { + 'drop_unique': drop_unique, 'retain_total': retain_total, 'downsample_unique': downsample_unique, + 'downsample_total': int(downsample_total), 'memory': self.estimate_memory(vocab_size=len(retain_words)) + } return report_values @@ -814,8 +862,7 @@ def _raw_word_count(self, job): return sum(len(sentence) for sentence in job) def train(self, sentences, total_examples=None, total_words=None, - epochs=None, start_alpha=None, end_alpha=None, - word_count=0, + epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, compute_loss=None): """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). @@ -831,11 +878,13 @@ def train(self, sentences, total_examples=None, total_words=None, explicit `epochs` argument MUST be provided. In the common and recommended case, where `train()` is only called once, the model's cached `iter` value should be supplied as `epochs` value. """ - if (self.model_trimmed_post_training): + if self.model_trimmed_post_training: raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method") if FAST_VERSION < 0: - warnings.warn("C extension not loaded for Word2Vec, training will be slow. " - "Install a C compiler and reinstall gensim for fast training.") + warnings.warn( + "C extension not loaded for Word2Vec, training will be slow. " + "Install a C compiler and reinstall gensim for fast training." 
+ ) self.neg_labels = [] if self.negative > 0: # precompute negative labels optimization for pure-python training @@ -849,8 +898,8 @@ def train(self, sentences, total_examples=None, total_words=None, logger.info( "training model with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s negative=%s window=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, - self.hs, self.sample, self.negative, self.window) + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative, self.window + ) if not self.wv.vocab: raise RuntimeError("you must first build vocabulary before training the model") @@ -861,11 +910,15 @@ def train(self, sentences, total_examples=None, total_words=None, raise ValueError( "The number of sentences in the training corpus is missing. Did you load the model via KeyedVectors.load_word2vec_format?" "Models loaded via load_word2vec_format don't support further training. " - "Instead start with a blank model, scan_vocab on the new corpus, intersect_word2vec_format with the old model, then train.") + "Instead start with a blank model, scan_vocab on the new corpus, " + "intersect_word2vec_format with the old model, then train." + ) if total_words is None and total_examples is None: raise ValueError( - "You must specify either total_examples or total_words, for proper alpha and progress calculations. The usual value is total_examples=model.corpus_count.") + "You must specify either total_examples or total_words, for proper alpha and progress calculations. " + "The usual value is total_examples=model.corpus_count." + ) if epochs is None: raise ValueError("You must specify an explict epochs count. The usual value is epochs=model.iter.") start_alpha = start_alpha or self.alpha @@ -900,9 +953,7 @@ def job_producer(): pushed_words, pushed_examples = 0, 0 next_alpha = start_alpha if next_alpha > self.min_alpha_yet_reached: - logger.warning( - "Effective 'alpha' higher than previous training cycles" - ) + logger.warning("Effective 'alpha' higher than previous training cycles") self.min_alpha_yet_reached = next_alpha job_no = 0 @@ -918,7 +969,8 @@ def job_producer(): # no => submit the existing job logger.debug( "queueing job #%i (%i words, %i sentences) at alpha %.05f", - job_no, batch_size, len(job_batch), next_alpha) + job_no, batch_size, len(job_batch), next_alpha + ) job_no += 1 job_queue.put((job_batch, next_alpha)) @@ -942,15 +994,15 @@ def job_producer(): if job_batch: logger.debug( "queueing job #%i (%i words, %i sentences) at alpha %.05f", - job_no, batch_size, len(job_batch), next_alpha) + job_no, batch_size, len(job_batch), next_alpha + ) job_no += 1 job_queue.put((job_batch, next_alpha)) if job_no == 0 and self.train_count == 0: logger.warning( "train() called with an empty iterator (if not intended, " - "be sure to provide a corpus that offers restartable " - "iteration = an iterable)." + "be sure to provide a corpus that offers restartable iteration = an iterable)." ) # give the workers heads up that they can finish -- no more work! 
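The calling pattern these checks enforce, as a short usage sketch (the corpus file name is hypothetical):

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence('corpus.txt')  # hypothetical file: one whitespace-delimited sentence per line
model = Word2Vec(size=100, window=5, min_count=5, workers=4)  # no sentences passed, so nothing trained yet
model.build_vocab(sentences)  # initial vocabulary survey
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)

Passing total_examples=model.corpus_count and epochs=model.iter is the common single-call case the docstring above recommends.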
@@ -995,34 +1047,31 @@ def job_producer(): logger.info( "PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", 100.0 * example_count / total_examples, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue)) + utils.qsize(job_queue), utils.qsize(progress_queue) + ) else: # words-based progress % logger.info( "PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", 100.0 * raw_word_count / total_words, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue)) + utils.qsize(job_queue), utils.qsize(progress_queue) + ) next_report = elapsed + report_delay # all done; report the final stats elapsed = default_timer() - start logger.info( "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed) + raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed + ) if job_tally < 10 * self.workers: - logger.warning( - "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay" - ) + logger.warning("under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay") # check that the input corpus hasn't changed during iteration if total_examples and total_examples != example_count: - logger.warning( - "supplied example count (%i) did not equal expected count (%i)", example_count, total_examples - ) + logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples) if total_words and total_words != raw_word_count: - logger.warning( - "supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words - ) + logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words) self.train_count += 1 # number of times train() has been called self.total_train_time += elapsed @@ -1049,21 +1098,25 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor """ if FAST_VERSION < 0: - warnings.warn("C extension compilation failed, scoring will be slow. " - "Install a C compiler and reinstall gensim for fastness.") + warnings.warn( + "C extension compilation failed, scoring will be slow. " + "Install a C compiler and reinstall gensim for fastness." + ) logger.info( "scoring sentences with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s and negative=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative) + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative + ) if not self.wv.vocab: raise RuntimeError("you must first build vocabulary before scoring new data") if not self.hs: - raise RuntimeError("We have currently only implemented score \ - for the hierarchical softmax scheme, so you need to have \ - run word2vec with hs=1 and negative=0 for this to work.") + raise RuntimeError( + "We have currently only implemented score for the hierarchical softmax scheme, " + "so you need to have run word2vec with hs=1 and negative=0 for this to work." 
+ ) def worker_loop(): """Compute log probability for each sentence, lifting lists of sentences from the jobs queue.""" @@ -1109,15 +1162,14 @@ def worker_loop(): if (job_no - 1) * chunksize > total_sentences: logger.warning( "terminating after %i sentences (set higher total_sentences if you want more).", - total_sentences) + total_sentences + ) job_no -= 1 raise StopIteration() logger.debug("putting job #%i in the queue", job_no) job_queue.put(items) except StopIteration: - logger.info( - "reached end of input; waiting to finish %i outstanding jobs", - job_no - done_jobs + 1) + logger.info("reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs + 1) for _ in xrange(self.workers): job_queue.put(None) # give the workers heads up that they can finish -- no more work! push_done = True @@ -1130,7 +1182,8 @@ def worker_loop(): if elapsed >= next_report: logger.info( "PROGRESS: at %.2f%% sentences, %.0f sentences/s", - 100.0 * sentence_count, sentence_count / elapsed) + 100.0 * sentence_count, sentence_count / elapsed + ) next_report = elapsed + report_delay # don't flood log, wait report_delay seconds else: # loop ended by job count; really done @@ -1142,7 +1195,8 @@ def worker_loop(): self.clear_sims() logger.info( "scoring %i sentences took %.1fs, %.0f sentences/s", - sentence_count, elapsed, sentence_count / elapsed) + sentence_count, elapsed, sentence_count / elapsed + ) return sentence_scores[:sentence_count] def clear_sims(self): @@ -1169,9 +1223,10 @@ def update_weights(self): # Raise an error if an online update is run before initial training on a corpus if not len(self.wv.syn0): - raise RuntimeError("You cannot do an online vocabulary-update of a model which has no prior vocabulary. " \ - "First build the vocabulary of your model with a corpus " \ - "before doing an online update.") + raise RuntimeError( + "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " + "First build the vocabulary of your model with a corpus before doing an online update." + ) self.wv.syn0 = vstack([self.wv.syn0, newsyn0]) @@ -1220,16 +1275,16 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut training. Use 1.0 to allow further training updates of merged vectors. """ overlap_count = 0 - logger.info("loading projection weights from %s" % (fname)) + logger.info("loading projection weights from %s", fname) with utils.smart_open(fname) as fin: header = utils.to_unicode(fin.readline(), encoding=encoding) - vocab_size, vector_size = map(int, header.split()) # throws for invalid file format + vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format if not vector_size == self.vector_size: raise ValueError("incompatible vector size %d in file %s" % (vector_size, fname)) # TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)? 
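# For reference, a minimal standalone sketch of the plain-text flavour of this
# vectors format (the data below is made up): a "vocab_size vector_size" header
# line, then one "word value value ..." line per word.
import io
import numpy as np

fake_file = io.StringIO(u"2 3\nking 0.10 0.20 0.30\nqueen 0.40 0.50 0.60\n")
vocab_size, vector_size = (int(x) for x in fake_file.readline().split())  # header
vectors = {}
for line_no, line in enumerate(fake_file):
    parts = line.rstrip().split(" ")
    if len(parts) != vector_size + 1:
        raise ValueError("invalid vector on line %s" % line_no)
    vectors[parts[0]] = np.array(parts[1:], dtype=np.float32)
print(vocab_size, vectors["queen"])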
if binary: binary_len = dtype(REAL).itemsize * vector_size - for line_no in xrange(vocab_size): + for _ in xrange(vocab_size): # mixed text and binary: read text first, then binary word = [] while True: @@ -1248,15 +1303,15 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut for line_no, line in enumerate(fin): parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") if len(parts) != vector_size + 1: - raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no)) - word, weights = parts[0], list(map(REAL, parts[1:])) + raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) + word, weights = parts[0], [REAL(x) for x in parts[1:]] if word in self.wv.vocab: overlap_count += 1 self.wv.syn0[self.wv.vocab[word].index] = weights self.syn0_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0 stops further changes - logger.info("merged %d vectors into %s matrix from %s" % (overlap_count, self.wv.syn0.shape, fname)) + logger.info("merged %d vectors into %s matrix from %s", overlap_count, self.wv.syn0.shape, fname) - def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None): + def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None): """ Deprecated. Use self.wv.most_similar() instead. Refer to the documentation for `gensim.models.KeyedVectors.most_similar` @@ -1270,7 +1325,7 @@ def wmdistance(self, document1, document2): """ return self.wv.wmdistance(document1, document2) - def most_similar_cosmul(self, positive=[], negative=[], topn=10): + def most_similar_cosmul(self, positive=None, negative=None, topn=10): """ Deprecated. Use self.wv.most_similar_cosmul() instead. Refer to the documentation for `gensim.models.KeyedVectors.most_similar_cosmul` @@ -1329,9 +1384,10 @@ def n_similarity(self, ws1, ws2): def predict_output_word(self, context_words_list, topn=10): """Report the probability distribution of the center word given the context words as input to the trained model.""" if not self.negative: - raise RuntimeError("We have currently only implemented predict_output_word " - "for the negative sampling scheme, so you need to have " - "run word2vec with negative > 0 for this to work.") + raise RuntimeError( + "We have currently only implemented predict_output_word for the negative sampling scheme, " + "so you need to have run word2vec with negative > 0 for this to work." 
+ ) if not hasattr(self.wv, 'syn0') or not hasattr(self, 'syn1neg'): raise RuntimeError("Parameters required for predicting the output words not found.") @@ -1350,8 +1406,7 @@ def predict_output_word(self, context_words_list, topn=10): prob_values = exp(dot(l1, self.syn1neg.T)) # propagate hidden -> output and take softmax to get probabilities prob_values /= sum(prob_values) top_indices = matutils.argsort(prob_values, topn=topn, reverse=True) - return [(self.wv.index2word[index1], prob_values[index1]) for index1 in - top_indices] # returning the most probable output words with their probabilities + return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices] # returning the most probable output words with their probabilities def init_sims(self, replace=False): """ @@ -1373,8 +1428,10 @@ def estimate_memory(self, vocab_size=None, report=None): if self.negative: report['syn1neg'] = vocab_size * self.layer1_size * dtype(REAL).itemsize report['total'] = sum(report.values()) - logger.info("estimated required memory for %i words and %i dimensions: %i bytes", - vocab_size, self.vector_size, report['total']) + logger.info( + "estimated required memory for %i words and %i dimensions: %i bytes", + vocab_size, self.vector_size, report['total'] + ) return report @staticmethod @@ -1393,8 +1450,7 @@ def log_evaluate_word_pairs(pearson, spearman, oov, pairs): """ return KeyedVectors.log_evaluate_word_pairs(pearson, spearman, oov, pairs) - def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, - dummy4unknown=False): + def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): """ Deprecated. Use self.wv.evaluate_word_pairs() instead. Refer to the documentation for `gensim.models.KeyedVectors.evaluate_word_pairs` @@ -1402,12 +1458,14 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown) def __str__(self): - return "%s(vocab=%s, size=%s, alpha=%s)" % ( - self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha) + return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha) def _minimize_model(self, save_syn1=False, save_syn1neg=False, save_syn0_lockf=False): warnings.warn( - "This method would be deprecated in the future. Keep just_word_vectors = model.wv to retain just the KeyedVectors instance for read-only querying of word vectors.") + "This method would be deprecated in the future. " + "Keep just_word_vectors = model.wv to retain just the KeyedVectors instance " + "for read-only querying of word vectors." 
+ ) if save_syn1 and save_syn1neg and save_syn0_lockf: return if hasattr(self, 'syn1') and not save_syn1: @@ -1450,7 +1508,7 @@ def load(cls, *args, **kwargs): if hasattr(v, 'sample_int'): break # already 0.12.0+ style int probabilities elif hasattr(v, 'sample_probability'): - v.sample_int = int(round(v.sample_probability * 2 ** 32)) + v.sample_int = int(round(v.sample_probability * 2**32)) del v.sample_probability if not hasattr(model, 'syn0_lockf') and hasattr(model, 'syn0'): model.syn0_lockf = ones(len(model.wv.syn0), dtype=REAL) @@ -1474,7 +1532,7 @@ def _load_specials(self, *args, **kwargs): @classmethod def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', - limit=None, datatype=REAL): + limit=None, datatype=REAL): """Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.""" raise DeprecationWarning("Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.") @@ -1581,7 +1639,7 @@ def __iter__(self): line = utils.to_unicode(line).split() i = 0 while i < len(line): - yield line[i:i + self.max_sentence_length] + yield line[i: i + self.max_sentence_length] i += self.max_sentence_length @@ -1599,7 +1657,7 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): Example:: - sentences = LineSentencePath(os.getcwd() + '\\corpus\\') + sentences = PathLineSentences(os.getcwd() + '\\corpus\\') The files in the directory should be either text files, .bz2 files, or .gz files. @@ -1613,19 +1671,19 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): self.input_files = [self.source] # force code compatibility with list of files elif os.path.isdir(self.source): self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path - logging.debug('reading directory ' + self.source) + logging.debug('reading directory %s', self.source) self.input_files = os.listdir(self.source) self.input_files = [self.source + file for file in self.input_files] # make full paths self.input_files.sort() # makes sure it happens in filename order else: # not a file or a directory, then we can't do anything with it raise ValueError('input is neither a file nor a path') - logging.info('files read into PathLineSentences:' + '\n'.join(self.input_files)) + logging.info('files read into PathLineSentences:%s', '\n'.join(self.input_files)) def __iter__(self): - '''iterate through the files''' + """iterate through the files""" for file_name in self.input_files: - logging.info('reading file ' + file_name) + logging.info('reading file %s', file_name) with utils.smart_open(file_name) as fin: for line in itertools.islice(fin, self.limit): line = utils.to_unicode(line).split() @@ -1638,7 +1696,6 @@ def __iter__(self): # Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3 if __name__ == "__main__": import argparse - logging.basicConfig( format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) @@ -1651,7 +1708,7 @@ def __iter__(self): print(globals()['__doc__'] % locals()) sys.exit(1) - from gensim.models.word2vec import Word2Vec # avoid referencing __main__ in pickle + from gensim.models.word2vec import Word2Vec # noqa:F811 avoid referencing __main__ in pickle seterr(all='raise') # don't ignore numpy errors @@ -1660,23 +1717,14 @@ def __iter__(self): parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors") parser.add_argument("-window", 
help="Set max skip length WINDOW between words; default is 5", type=int, default=5) parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100) - parser.add_argument("-sample", - help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", - type=float, default=1e-3) - parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, - choices=[0, 1]) - parser.add_argument("-negative", - help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", - type=int, default=5) + parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3) + parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1]) + parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5) parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12) parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5) - parser.add_argument("-min_count", - help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, - default=5) - parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", - type=int, default=1, choices=[0, 1]) - parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, - default=0, choices=[0, 1]) + parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5) + parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1]) + parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1]) parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model") args = parser.parse_args() @@ -1691,7 +1739,8 @@ def __iter__(self): model = Word2Vec( corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window, sample=args.sample, sg=skipgram, hs=args.hs, - negative=args.negative, cbow_mean=1, iter=args.iter) + negative=args.negative, cbow_mean=1, iter=args.iter + ) if args.output: outfile = args.output @@ -1709,4 +1758,3 @@ def __iter__(self): logger.info("finished running %s", program) - From 8abd58b58cc35521cb930ffee78c723b481877ab Mon Sep 17 00:00:00 2001 From: jodevak Date: Mon, 25 Sep 2017 18:17:38 +0300 Subject: [PATCH 03/20] fix build vocab speed issue, function build vocab from previously provided word frequencies table --- gensim/models/word2vec.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 2e6eb89cb2..aada500661 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -612,7 +612,6 @@ def create_binary_tree(self): logger.info("built huffman tree with maximum node depth %i", max_depth) def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, 
progress_per=10000, update=False): - """ Build vocabulary from a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of unicode strings. @@ -623,7 +622,6 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): - """ Build vocabulary from a dictionary of word frequencies """ @@ -1758,3 +1756,4 @@ def __iter__(self): logger.info("finished running %s", program) + From 8ec04332d3f8314635d51433a3c78e55de0cb695 Mon Sep 17 00:00:00 2001 From: jodevak Date: Mon, 25 Sep 2017 18:27:04 +0300 Subject: [PATCH 04/20] fix build vocab speed issue, function build vocab from previously provided word frequencies table --- gensim/models/word2vec.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index aada500661..20f1fc560c 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -643,32 +643,28 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): min_reduce = 1 vocab = defaultdict(int) checked_string_types = 0 + for sentence_no, sentence in enumerate(sentences): if not checked_string_types: if isinstance(sentence, string_types): logger.warning( - "Each 'sentences' item should be a list of words (usually unicode strings). " - "First item here is instead plain %s.", - type(sentence) + "Each 'sentences' item should be a list of words (usually unicode strings)." + "First item here is instead plain %s.", type(sentence) ) checked_string_types += 1 if sentence_no % progress_per == 0: - logger.info( - "PROGRESS: at sentence #%i, processed %i words, keeping %i word types", - sentence_no, sum(itervalues(vocab)) + total_words, len(vocab) - ) + logger.info("PROGRESS: at sentence #%i, processed %i words, keeping %i word types", + sentence_no, total_words, len(vocab)) + for word in sentence: vocab[word] += 1 + total_words += 1 if self.max_vocab_size and len(vocab) > self.max_vocab_size: - total_words += utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) + utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) min_reduce += 1 - total_words += sum(itervalues(vocab)) - logger.info( - "collected %i word types from a corpus of %i raw words and %i sentences", - len(vocab), total_words, sentence_no + 1 - ) + logger.info("collected %i word types from a corpus of %i raw words and %i sentences", len(vocab), total_words, sentence_no + 1) self.corpus_count = sentence_no + 1 self.raw_vocab = vocab @@ -1755,5 +1751,3 @@ def __iter__(self): model.accuracy(args.accuracy) logger.info("finished running %s", program) - - From b9f3a5f81d8dd4c6e953e0dd353bc89b0ea61b2f Mon Sep 17 00:00:00 2001 From: jodevak Date: Mon, 16 Oct 2017 21:26:16 +0300 Subject: [PATCH 05/20] Removing the extra blank lines, documentation in numpy-style to build_vocab_from_freq, and hanging indents in build_vocab --- gensim/models/word2vec.py | 132 ++++++++++++++++++++++++++------------ 1 file changed, 92 insertions(+), 40 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 20f1fc560c..d4d92aed56 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -114,8 +114,8 @@ except ImportError: from Queue import Queue, Empty -from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL,\ - uint32, seterr, array, uint8, vstack, fromstring, sqrt,\ +from numpy import exp, log, dot, zeros, 
outer, random, dtype, float32 as REAL, \ + uint32, seterr, array, uint8, vstack, fromstring, sqrt, \ empty, sum as np_sum, ones, logaddexp from scipy.special import expit @@ -136,6 +136,7 @@ FAST_VERSION = -1 MAX_WORDS_IN_BATCH = 10000 + def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): """ Update skip-gram model by training on a sequence of sentences. @@ -150,7 +151,7 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): result = 0 for sentence in sentences: word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2**32] + model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code @@ -166,6 +167,7 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): result += len(word_vocabs) return result + def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False): """ Update CBOW model by training on a sequence of sentences. @@ -180,7 +182,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss result = 0 for sentence in sentences: word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2**32] + model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code start = max(0, pos - model.window + reduced_window) @@ -193,6 +195,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss result += len(word_vocabs) return result + def score_sentence_sg(model, sentence, work=None): """ Obtain likelihood score for a single sentence in a fitted skip-gram representaion. @@ -222,6 +225,7 @@ def score_sentence_sg(model, sentence, work=None): return log_prob_sentence + def score_sentence_cbow(model, sentence, work=None, neu1=None): """ Obtain likelihood score for a single sentence in a fitted CBOW representaion. 
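A small standalone sketch of the dynamic-window idea used in these batching hunks (sentence and numbers are made up): each centre position shrinks the effective window on both sides by a random amount `b`.

import random

sentence = ["the", "quick", "brown", "fox", "jumps", "over"]
window = 3
pos = 2                                         # centre word "brown"
reduced_window = random.randint(0, window - 1)  # `b` in the original word2vec code
start = max(0, pos - window + reduced_window)   # left edge of the shrunken window
stop = pos + window + 1 - reduced_window        # right edge (exclusive)
context = [w for i, w in enumerate(sentence[start:stop], start) if i != pos]
print(reduced_window, context)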
@@ -295,7 +299,7 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h # loss component corresponding to hierarchical softmax if compute_loss: - sgn = (-1.0)**predict_word.code # `ch` function, 0 -> 1, 1 -> -1 + sgn = (-1.0) ** predict_word.code # `ch` function, 0 -> 1, 1 -> -1 lprob = -log(expit(-sgn * prod_term)) model.running_training_loss += sum(lprob) @@ -329,7 +333,8 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h return neu1e -def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, compute_loss=False, +def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, + compute_loss=False, context_vectors=None, context_locks=None, is_ft=False): if context_vectors is None: if is_ft: @@ -357,7 +362,7 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr # loss component corresponding to hierarchical softmax if compute_loss: - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 model.running_training_loss += sum(-log(expit(-sgn * prod_term))) if model.negative: @@ -401,14 +406,14 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr def score_sg_pair(model, word, word2): l1 = model.wv.syn0[word2.index] l2a = deepcopy(model.syn1[word.point]) # 2d matrix, codelen x layer1_size - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) return sum(lprob) def score_cbow_pair(model, word, l1): l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) return sum(lprob) @@ -540,7 +545,8 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, if isinstance(sentences, GeneratorType): raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.") self.build_vocab(sentences, trim_rule=trim_rule) - self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, start_alpha=self.alpha, end_alpha=self.min_alpha) + self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, start_alpha=self.alpha, + end_alpha=self.min_alpha) else: if trim_rule is not None: logger.warning( @@ -552,7 +558,7 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, def initialize_word_vectors(self): self.wv = KeyedVectors() - def make_cum_table(self, power=0.75, domain=2**31 - 1): + def make_cum_table(self, power=0.75, domain=2 ** 31 - 1): """ Create a cumulative-distribution table using stored vocabulary word counts for drawing random words in the negative-sampling training routines. 
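A standalone numpy illustration of the `ch` sign trick these hunks rely on (numbers are made up): a Huffman code bit of 0 maps to sgn = +1 and a bit of 1 to sgn = -1, and the scoring expression -logaddexp(0, -sgn * x) is exactly log(sigmoid(sgn * x)).

import numpy as np
from scipy.special import expit  # logistic sigmoid

code = np.array([0, 1, 1], dtype=np.uint8)    # hypothetical Huffman code of one word
prod_term = np.array([1.2, -0.3, 0.8])        # dot products along that word's tree path
sgn = (-1.0) ** code                          # `ch` function: 0 -> +1, 1 -> -1
lprob = -np.logaddexp(0, -sgn * prod_term)    # elementwise log(sigmoid(sgn * prod_term))
assert np.allclose(lprob, np.log(expit(sgn * prod_term)))
print(lprob.sum())                            # the per-word score sums these path terms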
@@ -569,10 +575,10 @@ def make_cum_table(self, power=0.75, domain=2**31 - 1): # compute sum of all power (Z in paper) train_words_pow = 0.0 for word_index in xrange(vocab_size): - train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count**power + train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count ** power cumulative = 0.0 for word_index in xrange(vocab_size): - cumulative += self.wv.vocab[self.wv.index2word[word_index]].count**power + cumulative += self.wv.vocab[self.wv.index2word[word_index]].count ** power self.cum_table[word_index] = round(cumulative / train_words_pow * domain) if len(self.cum_table) > 0: assert self.cum_table[-1] == domain @@ -617,13 +623,38 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_ Each sentence must be a list of unicode strings. """ self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey - self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling + self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, + update=update) # trim by min_count & precalculate downsampling self.finalize_vocab(update=update) # build tables & arrays - def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): """ - Build vocabulary from a dictionary of word frequencies + Build vocabulary from a dictionary of word frequencies. + Build model vocabulary from a passed dictionary that contains (word,word count). + Words must be of type unicode strings. + + Parameters + ---------- + `word_freq` : dict + Word,Word_Count dictionary. + `keep_raw_vocab` : bool + If not true, delete the raw vocabulary after the scaling is done and free up RAM. + `corpus_count`: int + Even if no corpus is provided, this argument can set corpus_count explicitly. + `trim_rule` = vocabulary trimming rule, specifies whether certain words should remain + in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). + Can be None (min_count will be used), or a callable that accepts parameters (word, count, min_count) and + returns either `utils.RULE_DISCARD`, `utils.RULE_KEEP` or `utils.RULE_DEFAULT`. + `update`: bool + If true, the new provided words in `word_freq` dict will be added to model's vocab. 
+ + Returns + -------- + None + + Examples + -------- + >>> build_vocab_from_freq({"Word1":15,"Word2":20}, update=True) """ logger.info("Processing provided word frequencies") vocab = defaultdict(int, word_freq) @@ -631,10 +662,10 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No self.corpus_count = corpus_count if corpus_count else 0 self.raw_vocab = vocab - self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling + self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, + update=update) # trim by min_count & precalculate downsampling self.finalize_vocab(update=update) # build tables & arrays - def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): """Do an initial scan of all words appearing in sentences.""" logger.info("collecting all words and their counts") @@ -647,13 +678,16 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): for sentence_no, sentence in enumerate(sentences): if not checked_string_types: if isinstance(sentence, string_types): - logger.warning( - "Each 'sentences' item should be a list of words (usually unicode strings)." - "First item here is instead plain %s.", type(sentence) - ) + logger.warning("Each 'sentences' " + "item should be a list of words " + "(usually unicode strings)." + "First item here is instead plain %s.", type(sentence) + ) checked_string_types += 1 if sentence_no % progress_per == 0: - logger.info("PROGRESS: at sentence #%i, processed %i words, keeping %i word types", + logger.info("PROGRESS: at sentence #%i," + " processed %i words, " + "keeping %i word types", sentence_no, total_words, len(vocab)) for word in sentence: @@ -664,7 +698,8 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) min_reduce += 1 - logger.info("collected %i word types from a corpus of %i raw words and %i sentences", len(vocab), total_words, sentence_no + 1) + logger.info("collected %i word types from a corpus of %i raw words and %i sentences", len(vocab), total_words, + sentence_no + 1) self.corpus_count = sentence_no + 1 self.raw_vocab = vocab @@ -774,7 +809,7 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, word_probability = 1.0 downsample_total += v if not dry_run: - self.wv.vocab[w].sample_int = int(round(word_probability * 2**32)) + self.wv.vocab[w].sample_int = int(round(word_probability * 2 ** 32)) if not dry_run and not keep_raw_vocab: logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab)) @@ -892,7 +927,8 @@ def train(self, sentences, total_examples=None, total_words=None, logger.info( "training model with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s negative=%s window=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative, self.window + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative, + self.window ) if not self.wv.vocab: @@ -1059,13 +1095,16 @@ def job_producer(): raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed ) if job_tally < 10 * self.workers: - logger.warning("under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay") + logger.warning( + "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay") # check that the input corpus hasn't changed during iteration if 
total_examples and total_examples != example_count: - logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples) + logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count, + total_examples) if total_words and total_words != raw_word_count: - logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words) + logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, + total_words) self.train_count += 1 # number of times train() has been called self.total_train_time += elapsed @@ -1400,7 +1439,8 @@ def predict_output_word(self, context_words_list, topn=10): prob_values = exp(dot(l1, self.syn1neg.T)) # propagate hidden -> output and take softmax to get probabilities prob_values /= sum(prob_values) top_indices = matutils.argsort(prob_values, topn=topn, reverse=True) - return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices] # returning the most probable output words with their probabilities + return [(self.wv.index2word[index1], prob_values[index1]) for index1 in + top_indices] # returning the most probable output words with their probabilities def init_sims(self, replace=False): """ @@ -1444,7 +1484,8 @@ def log_evaluate_word_pairs(pearson, spearman, oov, pairs): """ return KeyedVectors.log_evaluate_word_pairs(pearson, spearman, oov, pairs) - def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): + def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, + dummy4unknown=False): """ Deprecated. Use self.wv.evaluate_word_pairs() instead. Refer to the documentation for `gensim.models.KeyedVectors.evaluate_word_pairs` @@ -1452,7 +1493,8 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown) def __str__(self): - return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha) + return "%s(vocab=%s, size=%s, alpha=%s)" % ( + self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha) def _minimize_model(self, save_syn1=False, save_syn1neg=False, save_syn0_lockf=False): warnings.warn( @@ -1502,7 +1544,7 @@ def load(cls, *args, **kwargs): if hasattr(v, 'sample_int'): break # already 0.12.0+ style int probabilities elif hasattr(v, 'sample_probability'): - v.sample_int = int(round(v.sample_probability * 2**32)) + v.sample_int = int(round(v.sample_probability * 2 ** 32)) del v.sample_probability if not hasattr(model, 'syn0_lockf') and hasattr(model, 'syn0'): model.syn0_lockf = ones(len(model.wv.syn0), dtype=REAL) @@ -1526,7 +1568,7 @@ def _load_specials(self, *args, **kwargs): @classmethod def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', - limit=None, datatype=REAL): + limit=None, datatype=REAL): """Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.""" raise DeprecationWarning("Deprecated. 
Use gensim.models.KeyedVectors.load_word2vec_format instead.") @@ -1690,6 +1732,7 @@ def __iter__(self): # Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3 if __name__ == "__main__": import argparse + logging.basicConfig( format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) @@ -1711,14 +1754,23 @@ def __iter__(self): parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors") parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5) parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100) - parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3) - parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1]) - parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5) + parser.add_argument("-sample", + help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", + type=float, default=1e-3) + parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, + choices=[0, 1]) + parser.add_argument("-negative", + help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", + type=int, default=5) parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12) parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5) - parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5) - parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1]) - parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1]) + parser.add_argument("-min_count", + help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, + default=5) + parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", + type=int, default=1, choices=[0, 1]) + parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, + default=0, choices=[0, 1]) parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model") args = parser.parse_args() From 0a5e8d6cd6b1d7e7dbe94b03ec14ab03a56d637f Mon Sep 17 00:00:00 2001 From: jodevak Date: Tue, 17 Oct 2017 13:00:40 +0300 Subject: [PATCH 06/20] Fixing Indentation --- gensim/models/word2vec.py | 115 +++++++++++++++----------------------- 1 file changed, 45 insertions(+), 70 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index d4d92aed56..7e52c060ec 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -114,8 +114,8 @@ except ImportError: from Queue import Queue, Empty -from numpy import exp, log, dot, zeros, outer, 
random, dtype, float32 as REAL, \ - uint32, seterr, array, uint8, vstack, fromstring, sqrt, \ +from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL,\ + uint32, seterr, array, uint8, vstack, fromstring, sqrt,\ empty, sum as np_sum, ones, logaddexp from scipy.special import expit @@ -136,7 +136,6 @@ FAST_VERSION = -1 MAX_WORDS_IN_BATCH = 10000 - def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): """ Update skip-gram model by training on a sequence of sentences. @@ -151,7 +150,7 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): result = 0 for sentence in sentences: word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] + model.wv.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code @@ -167,7 +166,6 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): result += len(word_vocabs) return result - def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False): """ Update CBOW model by training on a sequence of sentences. @@ -182,7 +180,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss result = 0 for sentence in sentences: word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] + model.wv.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code start = max(0, pos - model.window + reduced_window) @@ -195,7 +193,6 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss result += len(word_vocabs) return result - def score_sentence_sg(model, sentence, work=None): """ Obtain likelihood score for a single sentence in a fitted skip-gram representaion. @@ -225,7 +222,6 @@ def score_sentence_sg(model, sentence, work=None): return log_prob_sentence - def score_sentence_cbow(model, sentence, work=None, neu1=None): """ Obtain likelihood score for a single sentence in a fitted CBOW representaion. 
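The `sample_int > model.random.rand() * 2**32` test in these hunks treats a keep-probability as a 32-bit integer threshold; a small self-contained illustration with a made-up probability:

import random

keep_probability = 0.16                        # made-up value for a very frequent word
sample_int = int(round(keep_probability * 2**32))

draws = 100000
kept = sum(sample_int > random.random() * 2**32 for _ in range(draws))
print(sample_int, kept / float(draws))         # empirical keep rate comes out near 0.16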
@@ -299,7 +295,7 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h # loss component corresponding to hierarchical softmax if compute_loss: - sgn = (-1.0) ** predict_word.code # `ch` function, 0 -> 1, 1 -> -1 + sgn = (-1.0)**predict_word.code # `ch` function, 0 -> 1, 1 -> -1 lprob = -log(expit(-sgn * prod_term)) model.running_training_loss += sum(lprob) @@ -333,8 +329,7 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h return neu1e -def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, - compute_loss=False, +def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, compute_loss=False, context_vectors=None, context_locks=None, is_ft=False): if context_vectors is None: if is_ft: @@ -362,7 +357,7 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr # loss component corresponding to hierarchical softmax if compute_loss: - sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 model.running_training_loss += sum(-log(expit(-sgn * prod_term))) if model.negative: @@ -406,14 +401,14 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr def score_sg_pair(model, word, word2): l1 = model.wv.syn0[word2.index] l2a = deepcopy(model.syn1[word.point]) # 2d matrix, codelen x layer1_size - sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) return sum(lprob) def score_cbow_pair(model, word, l1): l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size - sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 + sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) return sum(lprob) @@ -545,8 +540,7 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, if isinstance(sentences, GeneratorType): raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.") self.build_vocab(sentences, trim_rule=trim_rule) - self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, start_alpha=self.alpha, - end_alpha=self.min_alpha) + self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, start_alpha=self.alpha, end_alpha=self.min_alpha) else: if trim_rule is not None: logger.warning( @@ -558,7 +552,7 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, def initialize_word_vectors(self): self.wv = KeyedVectors() - def make_cum_table(self, power=0.75, domain=2 ** 31 - 1): + def make_cum_table(self, power=0.75, domain=2**31 - 1): """ Create a cumulative-distribution table using stored vocabulary word counts for drawing random words in the negative-sampling training routines. 
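A minimal standalone sketch of the cumulative-table idea behind this method (an illustration, not gensim's code): raise the counts to `power`, scale the running total into an integer `domain`, then draw negative samples by binary search.

import bisect
import random

def build_cum_table(counts, power=0.75, domain=2**31 - 1):
    """counts: word frequencies, listed in index2word order."""
    z = sum(c ** power for c in counts)         # normalising constant (Z in the paper)
    cum_table, cumulative = [], 0.0
    for c in counts:
        cumulative += c ** power
        cum_table.append(int(round(cumulative / z * domain)))
    return cum_table

counts = [100, 50, 10, 1]                       # hypothetical word counts
table = build_cum_table(counts)
assert table[-1] == 2**31 - 1                   # the last entry always equals the domain
negatives = [bisect.bisect_left(table, random.randint(0, table[-1])) for _ in range(5)]
print(table, negatives)                         # frequent words are drawn far more often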
@@ -575,10 +569,10 @@ def make_cum_table(self, power=0.75, domain=2 ** 31 - 1):
         # compute sum of all power (Z in paper)
         train_words_pow = 0.0
         for word_index in xrange(vocab_size):
-            train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count ** power
+            train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count**power
         cumulative = 0.0
         for word_index in xrange(vocab_size):
-            cumulative += self.wv.vocab[self.wv.index2word[word_index]].count ** power
+            cumulative += self.wv.vocab[self.wv.index2word[word_index]].count**power
             self.cum_table[word_index] = round(cumulative / train_words_pow * domain)
         if len(self.cum_table) > 0:
             assert self.cum_table[-1] == domain
@@ -623,8 +617,7 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_
         Each sentence must be a list of unicode strings.

         """
         self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule)  # initial survey
-        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,
-                         update=update)  # trim by min_count & precalculate downsampling
+        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays

     def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False):
@@ -662,8 +655,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
         self.corpus_count = corpus_count if corpus_count else 0
         self.raw_vocab = vocab
-        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,
-                         update=update)  # trim by min_count & precalculate downsampling
+        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays

     def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
@@ -674,22 +666,20 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
         min_reduce = 1
         vocab = defaultdict(int)
         checked_string_types = 0
-
         for sentence_no, sentence in enumerate(sentences):
             if not checked_string_types:
                 if isinstance(sentence, string_types):
-                    logger.warning("Each 'sentences' "
-                                   "item should be a list of words "
-                                   "(usually unicode strings)."
-                                   "First item here is instead plain %s.", type(sentence)
-                                   )
+                    logger.warning(
+                        "Each 'sentences' item should be a list of words (usually unicode strings). "
+                        "First item here is instead plain %s.",
+                        type(sentence)
+                    )
                 checked_string_types += 1
             if sentence_no % progress_per == 0:
-                logger.info("PROGRESS: at sentence #%i,"
-                            " processed %i words, "
-                            "keeping %i word types",
-                            sentence_no, total_words, len(vocab))
-
+                logger.info(
+                    "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
+                    sentence_no, total_words, len(vocab)
+                )
             for word in sentence:
                 vocab[word] += 1
             total_words += 1
@@ -698,11 +688,13 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
             utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
             min_reduce += 1
-        logger.info("collected %i word types from a corpus of %i raw words and %i sentences", len(vocab), total_words,
-                    sentence_no + 1)
+        logger.info(
+            "collected %i word types from a corpus of %i raw words and %i sentences",
+            len(vocab), total_words, sentence_no + 1
+        )
         self.corpus_count = sentence_no + 1
-        self.raw_vocab = vocab
-
+        self.raw_vocab = vocab
+
     def scale_vocab(self, min_count=None, sample=None, dry_run=False,
                     keep_raw_vocab=False, trim_rule=None, update=False):
         """
@@ -809,7 +801,7 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False,
                 word_probability = 1.0
             downsample_total += v
             if not dry_run:
-                self.wv.vocab[w].sample_int = int(round(word_probability * 2 ** 32))
+                self.wv.vocab[w].sample_int = int(round(word_probability * 2**32))

         if not dry_run and not keep_raw_vocab:
             logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab))
@@ -927,8 +919,7 @@ def train(self, sentences, total_examples=None, total_words=None,
         logger.info(
             "training model with %i workers on %i vocabulary and %i features, "
             "using sg=%s hs=%s sample=%s negative=%s window=%s",
-            self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative,
-            self.window
+            self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative, self.window
         )

         if not self.wv.vocab:
@@ -1095,16 +1086,13 @@ def job_producer():
             raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed
         )
         if job_tally < 10 * self.workers:
-            logger.warning(
-                "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay")
+            logger.warning("under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay")

         # check that the input corpus hasn't changed during iteration
         if total_examples and total_examples != example_count:
-            logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count,
-                           total_examples)
+            logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples)
         if total_words and total_words != raw_word_count:
-            logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count,
-                           total_words)
+            logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words)

         self.train_count += 1  # number of times train() has been called
         self.total_train_time += elapsed
@@ -1439,8 +1427,7 @@ def predict_output_word(self, context_words_list, topn=10):
         prob_values = exp(dot(l1, self.syn1neg.T))  # propagate hidden -> output and take softmax to get probabilities
         prob_values /= sum(prob_values)
         top_indices = matutils.argsort(prob_values, topn=topn, reverse=True)
-        return [(self.wv.index2word[index1], prob_values[index1]) for index1 in
-                top_indices]  # returning the most probable output words with their probabilities
+        return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices]  # returning the most probable output words with their probabilities

     def init_sims(self, replace=False):
         """
@@ -1484,8 +1471,7 @@ def log_evaluate_word_pairs(pearson, spearman, oov, pairs):
         """
         return KeyedVectors.log_evaluate_word_pairs(pearson, spearman, oov, pairs)

-    def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True,
-                            dummy4unknown=False):
+    def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
         """
         Deprecated. Use self.wv.evaluate_word_pairs() instead.
         Refer to the documentation for `gensim.models.KeyedVectors.evaluate_word_pairs`
@@ -1493,8 +1479,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case
         return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown)

     def __str__(self):
-        return "%s(vocab=%s, size=%s, alpha=%s)" % (
-            self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha)
+        return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha)

     def _minimize_model(self, save_syn1=False, save_syn1neg=False, save_syn0_lockf=False):
         warnings.warn(
@@ -1544,7 +1529,7 @@ def load(cls, *args, **kwargs):
                 if hasattr(v, 'sample_int'):
                     break  # already 0.12.0+ style int probabilities
                 elif hasattr(v, 'sample_probability'):
-                    v.sample_int = int(round(v.sample_probability * 2 ** 32))
+                    v.sample_int = int(round(v.sample_probability * 2**32))
                     del v.sample_probability
             if not hasattr(model, 'syn0_lockf') and hasattr(model, 'syn0'):
                 model.syn0_lockf = ones(len(model.wv.syn0), dtype=REAL)
@@ -1568,7 +1553,7 @@ def _load_specials(self, *args, **kwargs):

     @classmethod
     def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
-                             limit=None, datatype=REAL):
+                             limit=None, datatype=REAL):
         """Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead."""
         raise DeprecationWarning("Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.")
@@ -1732,7 +1717,6 @@ def __iter__(self):
 # Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3
 if __name__ == "__main__":
     import argparse
-
     logging.basicConfig(
         format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
         level=logging.INFO)
@@ -1754,23 +1738,14 @@ def __iter__(self):
     parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors")
     parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5)
     parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100)
-    parser.add_argument("-sample",
-                        help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)",
-                        type=float, default=1e-3)
-    parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0,
-                        choices=[0, 1])
-    parser.add_argument("-negative",
-                        help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)",
-                        type=int, default=5)
+    parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3)
+    parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1])
+    parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5)
     parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12)
    parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5)
-    parser.add_argument("-min_count",
-                        help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int,
-                        default=5)
-    parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)",
-                        type=int, default=1, choices=[0, 1])
-    parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int,
-                        default=0, choices=[0, 1])
+    parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5)
+    parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1])
+    parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1])
     parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model")
     args = parser.parse_args()

From 644fcada795d87043d213026fab3c6e609022c67 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Tue, 17 Oct 2017 13:57:53 +0300
Subject: [PATCH 07/20] Fixing gensim/models/word2vec.py:697:1: W293 blank line contains whitespace

---
 gensim/models/word2vec.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 7e52c060ec..756660a19e 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -693,8 +693,8 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
             len(vocab), total_words, sentence_no + 1
         )
         self.corpus_count = sentence_no + 1
-        self.raw_vocab = vocab
-
+        self.raw_vocab = vocab
+
     def scale_vocab(self, min_count=None, sample=None, dry_run=False,
                     keep_raw_vocab=False, trim_rule=None, update=False):
         """

From c91b4cb157e48ca253bb89596ed640d92b91e916 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Tue, 17 Oct 2017 14:50:26 +0300
Subject: [PATCH 08/20] Remove trailing white spaces

---
 gensim/models/word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 756660a19e..66a242e288 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -693,7 +693,7 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
             len(vocab), total_words, sentence_no + 1
         )
         self.corpus_count = sentence_no + 1
-        self.raw_vocab = vocab
+        self.raw_vocab = vocab

     def scale_vocab(self, min_count=None, sample=None, dry_run=False,
                     keep_raw_vocab=False, trim_rule=None, update=False):

From 1e4ef3ee9cb99ac4b175518269bbe4977ab09bc1 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Wed, 18 Oct 2017 22:59:28 +0300
Subject: [PATCH 09/20] Adding test

---
 gensim/test/test_word2vec.py | 49 +++++++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 81123ccd7a..85a7855412 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -84,6 +84,53 @@ def load_on_instance():


 class TestWord2VecModel(unittest.TestCase):
+    def testBuildVocabFromFreq(self):
+        """Test that the algorithm is able to build vocabulary from given
+        frequency table"""
+        freq_dict={
+            'minors': 2, 'graph': 3, 'system': 4,
+            'trees': 3, 'eps': 2, 'computer': 2,
+            'survey': 2, 'user': 3, 'human': 2,
+            'time': 2, 'interface': 2, 'response': 2
+        }
+        model_hs = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=1, negative=0)
+        model_neg = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=0, negative=5)
+        model_hs.build_vocab_from_freq(freq_dict)
+        model_neg.build_vocab_from_freq(freq_dict)
+        self.assertTrue(len(model_hs.wv.vocab), 12)
+        self.assertTrue(len(model_neg.wv.vocab), 12)
+        self.assertEqual(model_hs.wv.vocab['minors'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
+        self.assertEqual(model_hs.wv.vocab['system'].count, 4)
+        self.assertEqual(model_hs.wv.vocab['trees'].count, 3)
+        self.assertEqual(model_hs.wv.vocab['eps'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['computer'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['survey'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['user'].count, 3)
+        self.assertEqual(model_hs.wv.vocab['human'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['time'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['interface'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['response'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['minors'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['graph'].count, 3)
+        self.assertEqual(model_neg.wv.vocab['system'].count, 4)
+        self.assertEqual(model_neg.wv.vocab['trees'].count, 3)
+        self.assertEqual(model_neg.wv.vocab['eps'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['computer'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['survey'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['user'].count, 3)
+        self.assertEqual(model_neg.wv.vocab['human'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['time'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['interface'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['response'].count, 2)
+        new_freq_dict={'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1}
+        model_hs.build_vocab_from_freq(new_freq_dict,update=True)
+        model_neg.build_vocab_from_freq(new_freq_dict,update=True)
+        self.assertTrue(model_hs.wv.vocab['graph'].count, 4)
+        self.assertTrue(model_hs.wv.vocab['artificial'].count, 4)
+        self.assertEqual(len(model_hs.wv.vocab), 14)
+        self.assertEqual(len(model_neg.wv.vocab), 14)
+
     def testOnlineLearning(self):
         """Test that the algorithm is able to add new words to the vocabulary and
         to a trained model when using a sorted vocabulary"""
@@ -831,4 +878,4 @@ def assertLess(self, a, b, msg=None):
         level=logging.DEBUG
     )
     logging.info("using optimization %s", word2vec.FAST_VERSION)
-    unittest.main()
+    unittest.main()
\ No newline at end of file

From 9ae7a84c946e9758ff45cdaaf7f16e7487105824 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Wed, 18 Oct 2017 23:34:43 +0300
Subject: [PATCH 10/20] fix spaces

---
 gensim/test/test_word2vec.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 85a7855412..20fb26ce4b 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -87,7 +87,7 @@ class TestWord2VecModel(unittest.TestCase):
     def testBuildVocabFromFreq(self):
         """Test that the algorithm is able to build vocabulary from given
         frequency table"""
-        freq_dict={
+        freq_dict = {
             'minors': 2, 'graph': 3, 'system': 4,
             'trees': 3, 'eps': 2, 'computer': 2,
             'survey': 2, 'user': 3, 'human': 2,
             'time': 2, 'interface': 2, 'response': 2
@@ -123,9 +123,9 @@ def testBuildVocabFromFreq(self):
         self.assertEqual(model_neg.wv.vocab['time'].count, 2)
         self.assertEqual(model_neg.wv.vocab['interface'].count, 2)
         self.assertEqual(model_neg.wv.vocab['response'].count, 2)
-        new_freq_dict={'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1}
-        model_hs.build_vocab_from_freq(new_freq_dict,update=True)
-        model_neg.build_vocab_from_freq(new_freq_dict,update=True)
+        new_freq_dict = {'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1}
+        model_hs.build_vocab_from_freq(new_freq_dict, update=True)
+        model_neg.build_vocab_from_freq(new_freq_dict, update=True)
         self.assertTrue(model_hs.wv.vocab['graph'].count, 4)
         self.assertTrue(model_hs.wv.vocab['artificial'].count, 4)
         self.assertEqual(len(model_hs.wv.vocab), 14)
         self.assertEqual(len(model_neg.wv.vocab), 14)
@@ -878,4 +878,4 @@ def assertLess(self, a, b, msg=None):
         level=logging.DEBUG
     )
     logging.info("using optimization %s", word2vec.FAST_VERSION)
-    unittest.main()
\ No newline at end of file
+    unittest.main()

From 1e82811cc1a4afa3cb07a86daacef5b631ffd5d3 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Mon, 6 Nov 2017 12:43:00 +0200
Subject: [PATCH 11/20] iteration 2 on code

---
 gensim/models/word2vec.py    | 16 ++++++++++------
 gensim/test/test_word2vec.py | 26 ++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 66a242e288..ab4521de63 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -647,13 +647,17 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No

         Examples
         --------
-        >>> build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
+        >>> model.build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
         """
         logger.info("Processing provided word frequencies")
-        vocab = defaultdict(int, word_freq)
+        raw_vocab = word_freq #Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) to be directly the raw vocab
+        logger.info(
+            "collected %i different raw word, with total frequency of %i",
+            len(raw_vocab), sum(itervalues(raw_vocab))
+        )

-        self.corpus_count = corpus_count if corpus_count else 0
-        self.raw_vocab = vocab
+        self.corpus_count = corpus_count if corpus_count else 0 #Since no sentences are provided, this is to control the corpus_count
+        self.raw_vocab = raw_vocab

         self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays
@@ -675,14 +679,14 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
                         type(sentence)
                     )
                 checked_string_types += 1
-            if sentence_no % progress_per == 0:
+            if sentence_no % progress_per == 0 and sentence_no != 0:
                 logger.info(
                     "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
                     sentence_no, total_words, len(vocab)
                 )
             for word in sentence:
                 vocab[word] += 1
-            total_words += 1
+            total_words += len(sentence)

             if self.max_vocab_size and len(vocab) > self.max_vocab_size:
                 utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 20fb26ce4b..45aee7366f 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -131,6 +131,32 @@ def testBuildVocabFromFreq(self):
         self.assertEqual(len(model_hs.wv.vocab), 14)
         self.assertEqual(len(model_neg.wv.vocab), 14)

+    def testPruneVocab(self):
+        """Test Prune vocab while scanning sentences"""
+        sentences = [
+            ["graph", "system"],
+            ["graph", "system"],
+            ["system", "eps"],
+            ["graph", "system"]
+        ]
+        model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
+        self.assertTrue(len(model.wv.vocab), 2)
+        self.assertEqual(model.wv.vocab['graph'].count, 3)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+
+        sentences = [
+            ["graph", "system"],
+            ["graph", "system"],
+            ["system", "eps"],
+            ["graph", "system"],
+            ["minors", "survey", "minors", "survey", "minors"]
+        ]
+        model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
+        self.assertTrue(len(model.wv.vocab), 3)
+        self.assertEqual(model.wv.vocab['graph'].count, 3)
+        self.assertEqual(model.wv.vocab['minors'].count, 3)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+
     def testOnlineLearning(self):
         """Test that the algorithm is able to add new words to the vocabulary and
         to a trained model when using a sorted vocabulary"""

From aa9227d7714906dee1524d91e6022370a2812b24 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Mon, 6 Nov 2017 12:50:51 +0200
Subject: [PATCH 12/20] iteration 2 on code

---
 gensim/models/word2vec.py    | 56 +++++++++++++++++-------------------
 gensim/test/test_word2vec.py | 30 +++++++++++++++++--
 2 files changed, 55 insertions(+), 31 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 39a7219433..ab4521de63 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -647,13 +647,17 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No

         Examples
         --------
-        >>> build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
+        >>> model.build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
         """
         logger.info("Processing provided word frequencies")
-        vocab = defaultdict(int, word_freq)
+        raw_vocab = word_freq #Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) to be directly the raw vocab
+        logger.info(
+            "collected %i different raw word, with total frequency of %i",
+            len(raw_vocab), sum(itervalues(raw_vocab))
+        )

-        self.corpus_count = corpus_count if corpus_count else 0
-        self.raw_vocab = vocab
+        self.corpus_count = corpus_count if corpus_count else 0 #Since no sentences are provided, this is to control the corpus_count
+        self.raw_vocab = raw_vocab

         self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays
@@ -675,14 +679,14 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
                         type(sentence)
                     )
                 checked_string_types += 1
-            if sentence_no % progress_per == 0:
+            if sentence_no % progress_per == 0 and sentence_no != 0:
                 logger.info(
                     "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
                     sentence_no, total_words, len(vocab)
                 )
             for word in sentence:
                 vocab[word] += 1
-            total_words += 1
+            total_words += len(sentence)

             if self.max_vocab_size and len(vocab) > self.max_vocab_size:
                 utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
@@ -1112,10 +1116,10 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor
         Note that you should specify total_sentences; we'll run into problems if you ask to score more than this number of sentences but it is inefficient to set the value too high.

-        See the article by [#taddy]_ and the gensim demo at [#deepir]_ for examples of how to use such scores in document classification.
+        See the article by [taddy]_ and the gensim demo at [deepir]_ for examples of how to use such scores in document classification.

-        .. [#taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics.
-        .. [#deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb
+        .. [taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics.
+        .. [deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb

         """
         if FAST_VERSION < 0:
@@ -1625,7 +1629,7 @@ class LineSentence(object):
     def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         """
         `source` can be either a string or a file object. Clip the file to the first
-        `limit` lines (or not clipped if limit is None, the default).
+        `limit` lines (or no clipped if limit is None, the default).

         Example::
@@ -1666,20 +1670,15 @@ def __iter__(self):

 class PathLineSentences(object):
     """
-
-    Works like word2vec.LineSentence, but will process all files in a directory in alphabetical order by filename.
-    The directory can only contain files that can be read by LineSentence: .bz2, .gz, and text files. Any file not ending
-    with .bz2 or .gz is assumed to be a text file. Does not work with subdirectories.
-
-    The format of files (either text, or compressed text files) in the path is one sentence = one line, with words already
-    preprocessed and separated by whitespace.
-
+    Simple format: one sentence = one line; words already preprocessed and separated by whitespace.
+    Like LineSentence, but will process all files in a directory in alphabetical order by filename
     """

     def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         """
         `source` should be a path to a directory (as a string) where all files can be opened by the
-        LineSentence class. Each file will be read up to `limit` lines (or not clipped if limit is None, the default).
+        LineSentence class. Each file will be read up to
+        `limit` lines (or no clipped if limit is None, the default).

         Example::
@@ -1693,23 +1692,23 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         self.limit = limit

         if os.path.isfile(self.source):
-            logger.debug('single file given as source, rather than a directory of files')
-            logger.debug('consider using models.word2vec.LineSentence for a single file')
+            logging.warning('single file read, better to use models.word2vec.LineSentence')
             self.input_files = [self.source]  # force code compatibility with list of files
         elif os.path.isdir(self.source):
             self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
-            logger.info('reading directory %s', self.source)
+            logging.debug('reading directory %s', self.source)
             self.input_files = os.listdir(self.source)
-            self.input_files = [self.source + filename for filename in self.input_files]  # make full paths
+            self.input_files = [self.source + file for file in self.input_files]  # make full paths
             self.input_files.sort()  # makes sure it happens in filename order
         else:  # not a file or a directory, then we can't do anything with it
             raise ValueError('input is neither a file nor a path')
-        logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))
+
+        logging.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))

     def __iter__(self):
         """iterate through the files"""
         for file_name in self.input_files:
-            logger.info('reading file %s', file_name)
+            logging.info('reading file %s', file_name)
             with utils.smart_open(file_name) as fin:
                 for line in itertools.islice(fin, self.limit):
                     line = utils.to_unicode(line).split()
@@ -1724,10 +1723,9 @@ def __iter__(self):
     import argparse
     logging.basicConfig(
         format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
-        level=logging.INFO
-    )
-    logger.info("running %s", " ".join(sys.argv))
-    logger.info("using optimization %s", FAST_VERSION)
+        level=logging.INFO)
+    logging.info("running %s", " ".join(sys.argv))
+    logging.info("using optimization %s", FAST_VERSION)

     # check and process cmdline input
     program = os.path.basename(sys.argv[0])

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 4c642ce5d2..45aee7366f 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -131,6 +131,32 @@ def testBuildVocabFromFreq(self):
         self.assertEqual(len(model_hs.wv.vocab), 14)
         self.assertEqual(len(model_neg.wv.vocab), 14)

+    def testPruneVocab(self):
+        """Test Prune vocab while scanning sentences"""
+        sentences = [
+            ["graph", "system"],
+            ["graph", "system"],
+            ["system", "eps"],
+            ["graph", "system"]
+        ]
+        model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
+        self.assertTrue(len(model.wv.vocab), 2)
+        self.assertEqual(model.wv.vocab['graph'].count, 3)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+
+        sentences = [
+            ["graph", "system"],
+            ["graph", "system"],
+            ["system", "eps"],
+            ["graph", "system"],
+            ["minors", "survey", "minors", "survey", "minors"]
+        ]
+        model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
+        self.assertTrue(len(model.wv.vocab), 3)
+        self.assertEqual(model.wv.vocab['graph'].count, 3)
+        self.assertEqual(model.wv.vocab['minors'].count, 3)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+
     def testOnlineLearning(self):
         """Test that the algorithm is able to add new words to the vocabulary and
         to a trained model when using a sorted vocabulary"""
@@ -291,11 +317,11 @@ def testPersistenceWord2VecFormat(self):
         self.assertFalse(np.allclose(model['human'], norm_only_model['human']))
         self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human']))
         limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, limit=3)
-        self.assertEqual(len(limited_model_kv.syn0), 3)
+        self.assertEquals(len(limited_model_kv.syn0), 3)
         half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(
             testfile(), binary=True, datatype=np.float16
         )
-        self.assertEqual(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)
+        self.assertEquals(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)

     def testNoTrainingCFormat(self):
         model = word2vec.Word2Vec(sentences, min_count=1)

From 2066a2afb1b5044729e41e662c2c790d792623c9 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Mon, 6 Nov 2017 15:24:49 +0200
Subject: [PATCH 13/20] Fixing old version of word2vec.py merge problems

---
 gensim/models/word2vec.py    | 47 +++++++++++++++++++++---------------
 gensim/test/test_word2vec.py | 16 ++++++------
 2 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index ab4521de63..1c72a86099 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -650,16 +650,17 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
         >>> model.build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
         """
         logger.info("Processing provided word frequencies")
-        raw_vocab = word_freq #Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) to be directly the raw vocab
+        raw_vocab = word_freq  # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) to be directly the raw vocab
         logger.info(
             "collected %i different raw word, with total frequency of %i",
             len(raw_vocab), sum(itervalues(raw_vocab))
         )

-        self.corpus_count = corpus_count if corpus_count else 0 #Since no sentences are provided, this is to control the corpus_count
+        self.corpus_count = corpus_count if corpus_count else 0  # Since no sentences are provided, this is to control the corpus_count
         self.raw_vocab = raw_vocab

-        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
+        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,
+                         update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays
@@ -1116,10 +1117,10 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor
         Note that you should specify total_sentences; we'll run into problems if you ask to score more than this number of sentences but it is inefficient to set the value too high.

-        See the article by [taddy]_ and the gensim demo at [deepir]_ for examples of how to use such scores in document classification.
+        See the article by [#taddy]_ and the gensim demo at [#deepir]_ for examples of how to use such scores in document classification.

-        .. [taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics.
-        .. [deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb
+        .. [#taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics.
+        .. [#deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb

         """
         if FAST_VERSION < 0:
@@ -1629,7 +1630,7 @@ class LineSentence(object):
     def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         """
         `source` can be either a string or a file object. Clip the file to the first
-        `limit` lines (or no clipped if limit is None, the default).
+        `limit` lines (or not clipped if limit is None, the default).

         Example::
@@ -1670,15 +1671,20 @@ def __iter__(self):

 class PathLineSentences(object):
     """
-    Simple format: one sentence = one line; words already preprocessed and separated by whitespace.
-    Like LineSentence, but will process all files in a directory in alphabetical order by filename
+
+    Works like word2vec.LineSentence, but will process all files in a directory in alphabetical order by filename.
+    The directory can only contain files that can be read by LineSentence: .bz2, .gz, and text files. Any file not ending
+    with .bz2 or .gz is assumed to be a text file. Does not work with subdirectories.
+
+    The format of files (either text, or compressed text files) in the path is one sentence = one line, with words already
+    preprocessed and separated by whitespace.
+
     """

     def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         """
         `source` should be a path to a directory (as a string) where all files can be opened by the
-        LineSentence class. Each file will be read up to
-        `limit` lines (or no clipped if limit is None, the default).
+        LineSentence class. Each file will be read up to `limit` lines (or not clipped if limit is None, the default).

         Example::
@@ -1692,23 +1698,23 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         self.limit = limit

         if os.path.isfile(self.source):
-            logging.warning('single file read, better to use models.word2vec.LineSentence')
+            logger.debug('single file given as source, rather than a directory of files')
+            logger.debug('consider using models.word2vec.LineSentence for a single file')
             self.input_files = [self.source]  # force code compatibility with list of files
         elif os.path.isdir(self.source):
             self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
-            logging.debug('reading directory %s', self.source)
+            logger.info('reading directory %s', self.source)
             self.input_files = os.listdir(self.source)
-            self.input_files = [self.source + file for file in self.input_files]  # make full paths
+            self.input_files = [self.source + filename for filename in self.input_files]  # make full paths
             self.input_files.sort()  # makes sure it happens in filename order
         else:  # not a file or a directory, then we can't do anything with it
             raise ValueError('input is neither a file nor a path')
-
-        logging.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))
+        logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))

     def __iter__(self):
         """iterate through the files"""
         for file_name in self.input_files:
-            logging.info('reading file %s', file_name)
+            logger.info('reading file %s', file_name)
             with utils.smart_open(file_name) as fin:
                 for line in itertools.islice(fin, self.limit):
                     line = utils.to_unicode(line).split()
@@ -1723,9 +1729,10 @@ def __iter__(self):
     import argparse
     logging.basicConfig(
         format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
-        level=logging.INFO)
-    logging.info("running %s", " ".join(sys.argv))
-    logging.info("using optimization %s", FAST_VERSION)
+        level=logging.INFO
+    )
+    logger.info("running %s", " ".join(sys.argv))
+    logger.info("using optimization %s", FAST_VERSION)

     # check and process cmdline input
     program = os.path.basename(sys.argv[0])

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 45aee7366f..9da2bb3d15 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -97,8 +97,8 @@ def testBuildVocabFromFreq(self):
         model_neg = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=0, negative=5)
         model_hs.build_vocab_from_freq(freq_dict)
         model_neg.build_vocab_from_freq(freq_dict)
-        self.assertTrue(len(model_hs.wv.vocab), 12)
-        self.assertTrue(len(model_neg.wv.vocab), 12)
+        self.assertEqual(len(model_hs.wv.vocab), 12)
+        self.assertEqual(len(model_neg.wv.vocab), 12)
         self.assertEqual(model_hs.wv.vocab['minors'].count, 2)
         self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
         self.assertEqual(model_hs.wv.vocab['system'].count, 4)
@@ -126,8 +126,8 @@ def testBuildVocabFromFreq(self):
         new_freq_dict = {'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1}
         model_hs.build_vocab_from_freq(new_freq_dict, update=True)
         model_neg.build_vocab_from_freq(new_freq_dict, update=True)
-        self.assertTrue(model_hs.wv.vocab['graph'].count, 4)
-        self.assertTrue(model_hs.wv.vocab['artificial'].count, 4)
+        self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
+        self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
         self.assertEqual(len(model_hs.wv.vocab), 14)
         self.assertEqual(len(model_neg.wv.vocab), 14)
@@ -140,7 +140,7 @@ def testPruneVocab(self):
             ["graph", "system"]
         ]
         model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
-        self.assertTrue(len(model.wv.vocab), 2)
+        self.assertEqual(len(model.wv.vocab), 2)
         self.assertEqual(model.wv.vocab['graph'].count, 3)
         self.assertEqual(model.wv.vocab['system'].count, 4)
@@ -152,7 +152,7 @@ def testPruneVocab(self):
             ["minors", "survey", "minors", "survey", "minors"]
         ]
         model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
-        self.assertTrue(len(model.wv.vocab), 3)
+        self.assertEqual(len(model.wv.vocab), 3)
         self.assertEqual(model.wv.vocab['graph'].count, 3)
         self.assertEqual(model.wv.vocab['minors'].count, 3)
         self.assertEqual(model.wv.vocab['system'].count, 4)
@@ -317,11 +317,11 @@ def testPersistenceWord2VecFormat(self):
         self.assertFalse(np.allclose(model['human'], norm_only_model['human']))
         self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human']))
         limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, limit=3)
-        self.assertEquals(len(limited_model_kv.syn0), 3)
+        self.assertEqual(len(limited_model_kv.syn0), 3)
         half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(
             testfile(), binary=True, datatype=np.float16
         )
-        self.assertEquals(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)
+        self.assertEqual(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)

     def testNoTrainingCFormat(self):
         model = word2vec.Word2Vec(sentences, min_count=1)

From 62ed129291ff5b1892e93f09bd700de22669f76a Mon Sep 17 00:00:00 2001
From: jodevak
Date: Mon, 6 Nov 2017 15:34:26 +0200
Subject: [PATCH 14/20] Fixing indent

---
 gensim/models/word2vec.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 1c72a86099..c3209e745d 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -659,8 +659,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
         self.corpus_count = corpus_count if corpus_count else 0  # Since no sentences are provided, this is to control the corpus_count
         self.raw_vocab = raw_vocab

-        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,
-                         update=update)  # trim by min_count & precalculate downsampling
+        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays

From 473d7e6a77330a84c62916f4b0b7398fbefc90ef Mon Sep 17 00:00:00 2001
From: jodevak
Date: Mon, 6 Nov 2017 16:37:36 +0200
Subject: [PATCH 15/20] Fixing Styling

---
 gensim/models/word2vec.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index c3209e745d..45bf6f90ae 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -647,7 +647,9 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No

         Examples
         --------
-        >>> model.build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
+        >>> from gensim.models.word2vec import Word2Vec
+        >>> model=Word2Vec()
+        >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20})
         """
         logger.info("Processing provided word frequencies")
         raw_vocab = word_freq  # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) to be directly the raw vocab
@@ -659,7 +661,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
         self.corpus_count = corpus_count if corpus_count else 0  # Since no sentences are provided, this is to control the corpus_count
         self.raw_vocab = raw_vocab

-        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,update=update)  # trim by min_count & precalculate downsampling
+        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays

From a65e36bc0fddcc39322b4b266226f6db241bb4e4 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Mon, 6 Nov 2017 17:15:34 +0200
Subject: [PATCH 16/20] Fixing Styling

---
 gensim/models/word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 45bf6f90ae..46c45e2abe 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -648,7 +648,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
         Examples
         --------
         >>> from gensim.models.word2vec import Word2Vec
-        >>> model=Word2Vec()
+        >>> model= Word2Vec()
         >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20})
         """
         logger.info("Processing provided word frequencies")

From 7f46a051a1027243872ccc9ea34e5b444e87b457 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Mon, 6 Nov 2017 18:24:30 +0200
Subject: [PATCH 17/20] test

---
 gensim/models/word2vec.py    | 1 +
 gensim/test/test_word2vec.py | 7 +++++++
 2 files changed, 8 insertions(+)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 46c45e2abe..751b1ce4c4 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -701,6 +701,7 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
         self.corpus_count = sentence_no + 1
         self.raw_vocab = vocab
+

     def scale_vocab(self, min_count=None, sample=None, dry_run=False,
                     keep_raw_vocab=False, trim_rule=None, update=False):
         """

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 9da2bb3d15..c11299f6d2 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -21,6 +21,7 @@
 from gensim import utils
 from gensim.models import word2vec, keyedvectors
 from testfixtures import log_capture
+from six import itervalues

 try:
     from pyemd import emd  # noqa:F401
@@ -157,6 +158,12 @@ def testPruneVocab(self):
         self.assertEqual(model.wv.vocab['minors'].count, 3)
         self.assertEqual(model.wv.vocab['system'].count, 4)

+    def testTotalWordCount(self):
+        model = word2vec.Word2Vec(size=10, min_count=0, seed=42)
+        model.build_vocab(sentences, keep_raw_vocab=True)
+        total_words = sum(itervalues(model.raw_vocab))
+        self.assertEqual(total_words, 29)
+
     def testOnlineLearning(self):
         """Test that the algorithm is able to add new words to the vocabulary and
         to a trained model when using a sorted vocabulary"""

From f744c4f199d8753bee0fe4f67c1a6a5fc2be8167 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Mon, 6 Nov 2017 18:25:13 +0200
Subject: [PATCH 18/20] test

---
 gensim/models/word2vec.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 751b1ce4c4..46c45e2abe 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -701,7 +701,6 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
         self.corpus_count = sentence_no + 1
         self.raw_vocab = vocab
-

     def scale_vocab(self, min_count=None, sample=None, dry_run=False,
                     keep_raw_vocab=False, trim_rule=None, update=False):
         """

From 64711642d65c5a6f95165e05631863b4120dfa0b Mon Sep 17 00:00:00 2001
From: jodevak
Date: Tue, 7 Nov 2017 11:59:10 +0200
Subject: [PATCH 19/20] adding total words count test

---
 gensim/models/word2vec.py    | 1 +
 gensim/test/test_word2vec.py | 4 +---
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 46c45e2abe..64b389b4f3 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -700,6 +700,7 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
         )
         self.corpus_count = sentence_no + 1
         self.raw_vocab = vocab
+        return total_words

     def scale_vocab(self, min_count=None, sample=None, dry_run=False,
                     keep_raw_vocab=False, trim_rule=None, update=False):
         """

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index c11299f6d2..242b6d39bd 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -21,7 +21,6 @@
 from gensim import utils
 from gensim.models import word2vec, keyedvectors
 from testfixtures import log_capture
-from six import itervalues

 try:
     from pyemd import emd  # noqa:F401
@@ -160,8 +159,7 @@ def testPruneVocab(self):

     def testTotalWordCount(self):
         model = word2vec.Word2Vec(size=10, min_count=0, seed=42)
-        model.build_vocab(sentences, keep_raw_vocab=True)
-        total_words = sum(itervalues(model.raw_vocab))
+        total_words = model.scan_vocab(sentences)
         self.assertEqual(total_words, 29)

     def testOnlineLearning(self):

From 9bc6b78daa82bdbdfe438c941aa9dbcc4c4efee1 Mon Sep 17 00:00:00 2001
From: jodevak
Date: Tue, 7 Nov 2017 12:02:32 +0200
Subject: [PATCH 20/20] adding total words count test

---
 gensim/models/word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 64b389b4f3..4ca0974a17 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -681,7 +681,7 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
                         type(sentence)
                     )
                 checked_string_types += 1
-            if sentence_no % progress_per == 0 and sentence_no != 0:
+            if sentence_no % progress_per == 0:
                 logger.info(
                     "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
                     sentence_no, total_words, len(vocab)