Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add build_vocab to poincare model #2505

Merged
merged 18 commits into from
Jul 7, 2019
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 57 additions & 25 deletions gensim/models/poincare.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,10 @@ def __init__(self, train_data, size=50, alpha=0.1, negative=10, workers=1, epsil
"""
self.train_data = train_data
self.kv = PoincareKeyedVectors(size)
self.all_relations = []
self.node_relations = defaultdict(set)
self._negatives_buffer = NegativesBuffer([])
self._negatives_buffer_size = 2000
self.size = size
self.train_alpha = alpha # Learning rate for training
self.burn_in_alpha = burn_in_alpha # Learning rate for burn-in
Expand All @@ -167,47 +171,74 @@ def __init__(self, train_data, size=50, alpha=0.1, negative=10, workers=1, epsil
self._np_random = np_random.RandomState(seed)
self.init_range = init_range
self._loss_grad = None
self._load_relations()
self._init_embeddings()
self.build_vocab(train_data)

def build_vocab(self, relations, update=False):
"""Build vocabulary from a relations.
Each relation must be a tuple of two unicode strings.

Parameters
----------
relations : list of tuples
List of tuples of positive examples of the form (node_1, node_2), where each node is given by its name (a unicode string), not by a numeric index.
update : bool
koiizukag marked this conversation as resolved.
Show resolved Hide resolved
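If true, embeddings already in the model are kept and only the nodes in `relations` that are new to the model's vocab get randomly initialized vectors. If false (the default), the embeddings of all nodes are re-initialized with random values, so any previously trained vectors are discarded. In both cases the nodes and relations are added to the model's vocab.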
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But relations doesn't contain nodes. Its description above says it contains "node indexes" (btw where does the user find those?).

Also, what happens if False (the default)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I modified the description of relations using init description.

But relations doesn't contain nodes. Its description above says it contains "node indexes" (btw where does the user find those?).

If update=False, the embeddings are initialized by random values.
(It means that the trained embeddings are cleaned.)

Also, what happens if False (the default)?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I don't really understand that, but that information should appear in the documentation.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added update=False description in build_vocab 👍


Examples
--------
Train a model and update vocab for online training:

def _load_relations(self):
"""Load relations from the train data and build vocab."""
vocab = {}
index2word = []
all_relations = [] # List of all relation pairs
node_relations = defaultdict(set) # Mapping from node index to its related node indices
.. sourcecode:: pycon

>>> from gensim.models.poincare import PoincareModel
>>> relations_1 = [('kangaroo', 'marsupial'), ('kangaroo', 'mammal')]
koiizukag marked this conversation as resolved.
Show resolved Hide resolved
>>> relations_2 = [('striped_skunk', 'mammal')]
>>>
>>> model = PoincareModel(relations_1, negative=1)
koiizukag marked this conversation as resolved.
Show resolved Hide resolved
>>> model.train(epochs=50)
>>>
>>> model.build_vocab(relations_2, update=True)
koiizukag marked this conversation as resolved.
Show resolved Hide resolved
>>> model.train(epochs=50)

"""
old_index2word_len = len(self.kv.index2word)

logger.info("loading relations from train data..")
for relation in self.train_data:
for relation in relations:
mpenkov marked this conversation as resolved.
Show resolved Hide resolved
if len(relation) != 2:
raise ValueError('Relation pair "%s" should have exactly two items' % repr(relation))
for item in relation:
if item in vocab:
vocab[item].count += 1
if item in self.kv.vocab:
mpenkov marked this conversation as resolved.
Show resolved Hide resolved
self.kv.vocab[item].count += 1
else:
vocab[item] = Vocab(count=1, index=len(index2word))
index2word.append(item)
self.kv.vocab[item] = Vocab(count=1, index=len(self.kv.index2word))
self.kv.index2word.append(item)
node_1, node_2 = relation
node_1_index, node_2_index = vocab[node_1].index, vocab[node_2].index
node_relations[node_1_index].add(node_2_index)
node_1_index, node_2_index = self.kv.vocab[node_1].index, self.kv.vocab[node_2].index
self.node_relations[node_1_index].add(node_2_index)
relation = (node_1_index, node_2_index)
all_relations.append(relation)
logger.info("loaded %d relations from train data, %d nodes", len(all_relations), len(vocab))
self.kv.vocab = vocab
self.kv.index2word = index2word
self.indices_set = set(range(len(index2word))) # Set of all node indices
self.indices_array = np.fromiter(range(len(index2word)), dtype=int) # Numpy array of all node indices
self.all_relations = all_relations
self.node_relations = node_relations
self.all_relations.append(relation)
logger.info("loaded %d relations from train data, %d nodes", len(self.all_relations), len(self.kv.vocab))
self.indices_set = set(range(len(self.kv.index2word))) # Set of all node indices
self.indices_array = np.fromiter(range(len(self.kv.index2word)), dtype=int) # Numpy array of all node indices
self._init_node_probabilities()
self._negatives_buffer = NegativesBuffer([]) # Buffer for negative samples, to reduce calls to sampling method
self._negatives_buffer_size = 2000

if not update:
self._init_embeddings()
else:
self._update_embeddings(old_index2word_len)

def _init_embeddings(self):
"""Randomly initialize vectors for all items in the vocab.

Replaces `self.kv.syn0` with a new matrix that has one row per entry of `self.kv.index2word`
and `self.size` columns. Values are drawn uniformly between `self.init_range[0]` and
`self.init_range[1]` using the model's own random state, then cast to `self.dtype`.
Whatever was stored in `self.kv.syn0` before the call is discarded.

"""
shape = (len(self.kv.index2word), self.size)
self.kv.syn0 = self._np_random.uniform(self.init_range[0], self.init_range[1], shape).astype(self.dtype)

def _update_embeddings(self, old_index2word_len):
"""Randomly initialize vectors for the items newly added to the vocab, keeping existing vectors.

Parameters
----------
old_index2word_len : int
    Length of `self.kv.index2word` before the new items were appended to it.

"""
# One new row for every item appended to the vocab since `old_index2word_len` was recorded.
shape = (len(self.kv.index2word) - old_index2word_len, self.size)
v = self._np_random.uniform(self.init_range[0], self.init_range[1], shape).astype(self.dtype)
# Append the new rows after the existing ones so that already-present rows keep their positions.
self.kv.syn0 = np.concatenate([self.kv.syn0, v])

def _init_node_probabilities(self):
"""Initialize a-priori probabilities."""
counts = np.fromiter((
Expand Down Expand Up @@ -829,6 +860,7 @@ def __init__(self, vector_size):
super(PoincareKeyedVectors, self).__init__(vector_size)
self.max_distance = 0
self.index2word = []
self.vocab = {}

@property
def vectors(self):
Expand Down
10 changes: 10 additions & 0 deletions gensim/test/test_poincare.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,16 @@ def test_persistence_separate_file(self):
loaded = PoincareModel.load(testfile())
self.models_equal(model, loaded)

def test_online_learning(self):
"""Tests that `build_vocab(..., update=True)` adds to the counts of nodes already in the vocab."""
model = PoincareModel(self.data, burn_in=0, negative=3)
# Baseline taken right after construction: vocab size and per-node occurrence counts.
self.assertEqual(len(model.kv.vocab), 7)
self.assertEqual(model.kv.vocab['kangaroo.n.01'].count, 3)
self.assertEqual(model.kv.vocab['cat.n.01'].count, 1)
model.build_vocab([('kangaroo.n.01', 'cat.n.01')], update=True)  # update vocab
# Both nodes of the added relation were already known, so each count grows by exactly one.
self.assertEqual(model.kv.vocab['kangaroo.n.01'].count, 4)
self.assertEqual(model.kv.vocab['cat.n.01'].count, 2)

def test_train_after_load(self):
"""Tests whether the model can be trained correctly after loading from disk."""
model = PoincareModel(self.data, burn_in=0, negative=3)
Expand Down