Skip to content

Commit

Permalink
fix online w2v code and add sanity checking
Browse files Browse the repository at this point in the history
  • Loading branch information
isomap committed Jul 12, 2016
1 parent 48c9f29 commit 22dab54
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 48 deletions.
52 changes: 21 additions & 31 deletions gensim/models/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@

from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL,\
double, uint32, seterr, array, uint8, vstack, fromstring, sqrt, newaxis,\
ndarray, empty, sum as np_sum, prod, ones, ascontiguousarray
ndarray, empty, sum as np_sum, prod, ones, ascontiguousarray, concatenate

from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc
from gensim.corpora.dictionary import Dictionary
Expand Down Expand Up @@ -600,21 +600,20 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab
else:
logger.info("Updating model with new vocabulary")
for word, v in iteritems(self.raw_vocab):
if not word in self.vocab:
# the word does not already exist in vocab
if keep_vocab_item(word, v, min_count,
trim_rule=trim_rule):
retain_words.append(word)
retain_total += v
original_total += v
if not dry_run:
self.vocab[word] = Vocab(count=v,
index=len(self.index2word))
if keep_vocab_item(word, v, min_count, trim_rule=trim_rule):
retain_words.append(word)
retain_total += v
original_total += v
if not dry_run:
if word in self.vocab:
self.vocab[word].count += v
else:
self.vocab[word] = Vocab(count=v, index=len(self.index2word))
self.index2word.append(word)
else:
drop_unique += 1
drop_total += v
original_total += v
else:
drop_unique += 1
drop_total += v
original_total += v

logger.info("min_count=%d retains %i unique words (drops %i)",
min_count, len(retain_words), drop_unique)
Expand Down Expand Up @@ -1036,28 +1035,19 @@ def update_weights(self):
added vocabulary.
"""
logger.info("updating layer weights")
newsyn0 = empty((len(self.vocab), self.vector_size), dtype=REAL)

# copy the weights that are already learned
for i in xrange(0, len(self.syn0)):
newsyn0[i] = deepcopy(self.syn0[i])
gained_vocab = len(self.vocab) - len(self.syn0)
newsyn0 = empty((gained_vocab, self.vector_size), dtype=REAL)

# randomize the remaining words
for i in xrange(len(self.vocab), len(newsyn0)):
for i in xrange(len(self.syn0), len(self.vocab)):
# construct deterministic seed from word AND seed argument
self.syn0[i] = self.seeded_vector(self.index2word[i] + str(self.seed))
self.syn0 = deepcopy(newsyn0)
newsyn0[i-len(self.syn0)] = self.seeded_vector(self.index2word[i] + str(self.seed))
self.syn0 = concatenate([self.syn0, newsyn0])

if self.hs:
oldsyn1 = deepcopy(self.syn1)
self.syn1 = zeros((len(self.vocab), self.layer1_size), dtype=REAL)
self.syn1[i] = deepcopy(oldsyn1[i])

self.syn1 = concatenate([self.syn1, zeros((gained_vocab, self.layer1_size), dtype=REAL)])
if self.negative:
oldneg = deepcopy(self.syn1neg)
self.syn1neg = zeros((len(self.vocab), self.layer1_size), dtype=REAL)
self.syn1neg[i] = deepcopy(oldneg[i])

self.syn1neg = concatenate([self.syn1neg, zeros((gained_vocab, self.layer1_size), dtype=REAL)])
self.syn0norm = None

# do not suppress learning for already learned words
Expand Down
57 changes: 40 additions & 17 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def __iter__(self):
['human', 'intelligence'],
['artificial', 'graph'],
['intelligence'],
['survey', 'user', 'artificial', 'system', 'response', 'time']
['artificial', 'intelligence', 'system']
]

def testfile():
Expand Down Expand Up @@ -111,27 +111,50 @@ class TestWord2VecModel(unittest.TestCase):
def testOnlineLearning(self):
"""Test that the algorithm is able to add new words to the
vocabulary and to a trained model when using a sorted vocabulary"""
# NOTE(review): this view is a rendered diff with indentation and +/- markers
# stripped, so the two pairs of constructor calls below look like the pre- and
# post-change versions of the same statements -- confirm against the real file.
model_hs = word2vec.Word2Vec(sentences, min_count=0, seed=42, hs=1, negative=0, sorted_vocab=0, iter=10)
model_neg = word2vec.Word2Vec(sentences, min_count=0, seed=42, hs=0, negative=5, sorted_vocab=0, iter=10)
model_hs = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=1, negative=0)
model_neg = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
# NOTE(review): unittest's assertTrue(expr, msg) only checks that expr is truthy;
# the second argument is the failure message, so 12 and 3 are never compared.
# assertEqual looks like what was intended -- confirm.
self.assertTrue(len(model_hs.vocab), 12)
self.assertTrue(model_hs.vocab['graph'].count, 3)
# Extend both models' vocabularies with the additional sentences (update=True).
model_hs.build_vocab(new_sentences, update=True)
model_neg.build_vocab(new_sentences, update=True)
# Snapshot the weight arrays so they can be compared after further training.
orig0hs = numpy.copy(model_hs.syn0)
orig0neg = numpy.copy(model_neg.syn0)
# Each array is compared with the copy taken just above, with no intervening
# update in the lines shown, so these four checks hold trivially.
self.assertTrue(numpy.allclose(model_hs.syn0, orig0hs))
self.assertTrue(numpy.allclose(model_neg.syn0, orig0neg))
orig1hs = numpy.copy(model_hs.syn1)
orig1neg = numpy.copy(model_neg.syn1neg)
self.assertTrue(numpy.allclose(model_hs.syn1, orig1hs))
self.assertTrue(numpy.allclose(model_neg.syn1neg, orig1neg))
# Continue training on the new sentences only.
model_hs.train(new_sentences)
model_neg.train(new_sentences)
# NOTE(review): same assertTrue(expr, msg) pitfall as above -- the expected
# counts (4) are not actually checked.
self.assertTrue(model_hs.vocab['graph'].count, 4)
self.assertTrue(model_hs.vocab['artificial'].count, 4)
# Both vocabularies are expected to hold 14 entries after the update.
self.assertEqual(len(model_hs.vocab), 14)
self.assertEqual(len(model_neg.vocab), 14)
# Training on the new sentences must have changed the weight arrays
# relative to the snapshots taken before train().
self.assertFalse(numpy.allclose(model_hs.syn1, orig1hs))
self.assertFalse(numpy.allclose(model_neg.syn1neg, orig1neg))
self.assertFalse(numpy.allclose(model_hs.syn0, orig0hs))
self.assertFalse(numpy.allclose(model_neg.syn0, orig0neg))

def onlineSanity(self, model):
    """Sanity-check online (incremental) vocabulary learning on ``model``.

    The module-level ``list_corpus`` is split into two disjoint subsets:
    documents containing the token 'terrorism' and all remaining documents.
    The model is first built and trained on the remaining documents only,
    then its vocabulary is extended with the 'terrorism' documents via
    ``build_vocab(..., update=True)`` and training continues on them.

    The check passes when the continued training changes ``model.syn0`` and
    the similarity between 'war' and 'terrorism' exceeds 0.5.

    :param model: an untrained ``word2vec.Word2Vec`` instance (constructed
        without sentences) configured with the variant under test.
    """
    # Partition in one pass so the subsets cannot overlap. Previously both
    # lists were built with the same ``'terrorism' in l`` predicate, so the
    # initial training already saw every 'terrorism' document and the update
    # step added nothing new.
    terro, others = [], []
    for doc in list_corpus:
        if 'terrorism' in doc:
            terro.append(doc)
        else:
            others.append(doc)

    # Initial training must not see the word that is added online later.
    model.build_vocab(others)
    model.train(others)
    self.assertFalse('terrorism' in model.vocab)

    # Online update: extend the vocabulary, snapshot the weights, train on.
    model.build_vocab(terro, update=True)
    orig0 = numpy.copy(model.syn0)
    model.train(terro)
    self.assertFalse(numpy.allclose(model.syn0, orig0))

    sim = model.n_similarity(['war'], ['terrorism'])
    self.assertGreater(sim, 0.5)

def test_sg_hs_online(self):
    """Online-learning sanity check: skip-gram with hierarchical softmax."""
    # Untrained model; onlineSanity builds the vocabulary and trains it.
    params = dict(sg=1, window=4, hs=1, negative=0, min_count=5, iter=10, workers=2)
    self.onlineSanity(word2vec.Word2Vec(**params))

def test_sg_neg_online(self):
    """Online-learning sanity check: skip-gram with negative sampling."""
    # Untrained model; onlineSanity builds the vocabulary and trains it.
    params = dict(sg=1, window=4, hs=0, negative=15, min_count=5, iter=10, workers=2)
    self.onlineSanity(word2vec.Word2Vec(**params))

def test_cbow_hs_online(self):
    """Online-learning sanity check: CBOW with hierarchical softmax."""
    # Untrained model; onlineSanity builds the vocabulary and trains it.
    params = dict(
        sg=0, cbow_mean=1, alpha=0.05, window=1, hs=1, negative=0,
        min_count=5, iter=10, workers=2, batch_words=1000,
    )
    self.onlineSanity(word2vec.Word2Vec(**params))

def test_cbow_neg_online(self):
    """Online-learning sanity check: CBOW with negative sampling."""
    # Untrained model; onlineSanity builds the vocabulary and trains it.
    params = dict(
        sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15,
        min_count=5, iter=10, workers=2, sample=0,
    )
    self.onlineSanity(word2vec.Word2Vec(**params))

def testPersistenceWord2VecFormat(self):
"""Test storing/loading the entire model in word2vec format."""
Expand Down

0 comments on commit 22dab54

Please sign in to comment.