From e4f3d956de29aa400661da70e1a0548f94ca9fb5 Mon Sep 17 00:00:00 2001 From: "kojiro.iizuka" Date: Wed, 29 May 2019 15:06:30 +0900 Subject: [PATCH 01/15] add build_vocab to poincare model --- gensim/models/poincare.py | 133 ++++++++++++++++++++++++++++------- gensim/test/test_poincare.py | 15 ++++ 2 files changed, 124 insertions(+), 24 deletions(-) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index 0c49c761f2..c4180572d5 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -152,6 +152,10 @@ def __init__(self, train_data, size=50, alpha=0.1, negative=10, workers=1, epsil """ self.train_data = train_data self.kv = PoincareKeyedVectors(size) + self.all_relations = [] + self.node_relations = defaultdict(set) + self._negatives_buffer = NegativesBuffer([]) + self._negatives_buffer_size = 2000 self.size = size self.train_alpha = alpha # Learning rate for training self.burn_in_alpha = burn_in_alpha # Learning rate for burn-in @@ -167,47 +171,48 @@ def __init__(self, train_data, size=50, alpha=0.1, negative=10, workers=1, epsil self._np_random = np_random.RandomState(seed) self.init_range = init_range self._loss_grad = None - self._load_relations() - self._init_embeddings() + self.build_vocab(train_data) - def _load_relations(self): + def build_vocab(self, relations=None, update=False): """Load relations from the train data and build vocab.""" - vocab = {} - index2word = [] - all_relations = [] # List of all relation pairs - node_relations = defaultdict(set) # Mapping from node index to its related node indices + old_index2word_len = len(self.kv.index2word) logger.info("loading relations from train data..") - for relation in self.train_data: + for relation in relations: if len(relation) != 2: raise ValueError('Relation pair "%s" should have exactly two items' % repr(relation)) for item in relation: - if item in vocab: - vocab[item].count += 1 + if item in self.kv.vocab: + self.kv.vocab[item].count += 1 else: - vocab[item] = Vocab(count=1, index=len(index2word)) - index2word.append(item) + self.kv.vocab[item] = Vocab(count=1, index=len(self.kv.index2word)) + self.kv.index2word.append(item) node_1, node_2 = relation - node_1_index, node_2_index = vocab[node_1].index, vocab[node_2].index - node_relations[node_1_index].add(node_2_index) + node_1_index, node_2_index = self.kv.vocab[node_1].index, self.kv.vocab[node_2].index + self.node_relations[node_1_index].add(node_2_index) relation = (node_1_index, node_2_index) - all_relations.append(relation) - logger.info("loaded %d relations from train data, %d nodes", len(all_relations), len(vocab)) - self.kv.vocab = vocab - self.kv.index2word = index2word - self.indices_set = set(range(len(index2word))) # Set of all node indices - self.indices_array = np.fromiter(range(len(index2word)), dtype=int) # Numpy array of all node indices - self.all_relations = all_relations - self.node_relations = node_relations + self.all_relations.append(relation) + logger.info("loaded %d relations from train data, %d nodes", len(self.all_relations), len(self.kv.vocab)) + self.indices_set = set(range(len(self.kv.index2word))) # Set of all node indices + self.indices_array = np.fromiter(range(len(self.kv.index2word)), dtype=int) # Numpy array of all node indices self._init_node_probabilities() - self._negatives_buffer = NegativesBuffer([]) # Buffer for negative samples, to reduce calls to sampling method - self._negatives_buffer_size = 2000 + + if not update: + self._init_embeddings() + else: + self._update_embeddings(old_index2word_len) def _init_embeddings(self): """Randomly initialize vectors for the items in the vocab.""" shape = (len(self.kv.index2word), self.size) self.kv.syn0 = self._np_random.uniform(self.init_range[0], self.init_range[1], shape).astype(self.dtype) + def _update_embeddings(self, old_index2word_len): + """Randomly initialize vectors for the items in the additional vocab.""" + shape = (len(self.kv.index2word) - old_index2word_len, self.size) + v = self._np_random.uniform(self.init_range[0], self.init_range[1], shape).astype(self.dtype) + self.kv.syn0 = np.concatenate([self.kv.syn0, v]) + def _init_node_probabilities(self): """Initialize a-priori probabilities.""" counts = np.fromiter(( @@ -564,6 +569,85 @@ def _update_vectors_batch(self, batch): self.kv.syn0[indices_v] -= v_updates self.kv.syn0[indices_v] = self._clip_vectors(self.kv.syn0[indices_v], self.epsilon) + def _build_vocab(self, relations=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False, + trim_rule=None, **kwargs): + """Build vocabulary from a sequence of sentences (can be a once-only generator stream). + Each sentence must be a list of unicode strings. + + Parameters + ---------- + relations : iterable of list of str, optional + Can be simply a list of lists of tokens, but for larger corpora, + consider an iterable that streams the sentences directly from disk/network. + See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` + or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + corpus_file : str, optional + Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or + `corpus_file` arguments need to be passed (not both of them). + update : bool + If true, the new words in `sentences` will be added to model's vocab. + progress_per : int + Indicates how many words to process before showing/updating the progress. + keep_raw_vocab : bool + If not true, delete the raw vocabulary after the scaling is done and free up RAM. + trim_rule : function, optional + Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, + be trimmed away, or handled using the default (discard if word count < min_count). + Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), + or a callable that accepts parameters (word, count, min_count) and returns either + :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. + The rule, if given, is only used to prune vocabulary during + :meth:`~gensim.models.fasttext.FastText.build_vocab` and is not stored as part of the model. + + The input parameters are of the following types: + * `word` (str) - the word we are examining + * `count` (int) - the word's frequency count in the corpus + * `min_count` (int) - the minimum count threshold. + + **kwargs + Additional key word parameters passed to + :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.build_vocab`. + + Examples + -------- + Train a model and update vocab for online training: + + .. sourcecode:: pycon + + >>> from gensim.models import FastText + >>> relations_1 = [('kangaroo', 'marsupial'), ('kangaroo', 'mammal')] + >>> relations_2 = [('gib', 'cat')] + >>> + >>> model = PoincareModel(relations, negative=2) + >>> model.build_vocab(relations_1) + >>> model.train(relations_1, epochs=5) + >>> + >>> model.build_vocab(relations_2, update=True) + >>> model.train(relations_2, epochs=5) + + """ + if not update: + self.wv.init_ngrams_weights(self.trainables.seed) + elif not len(self.wv.vocab): + raise RuntimeError( + "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " + "First build the vocabulary of your model with a corpus " + "by calling the gensim.models.fasttext.FastText.build_vocab method " + "before doing an online update." + ) + else: + self.vocabulary.old_vocab_len = len(self.wv.vocab) + + retval = super(FastText, self).build_vocab( + sentences=sentences, corpus_file=corpus_file, update=update, progress_per=progress_per, + keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) + + if update: + self.wv.update_ngrams_weights(self.trainables.seed, self.vocabulary.old_vocab_len) + + return retval + def train(self, epochs, batch_size=10, print_every=1000, check_gradients_every=None): """Train Poincare embeddings using loaded data and model parameters. @@ -829,6 +913,7 @@ def __init__(self, vector_size): super(PoincareKeyedVectors, self).__init__(vector_size) self.max_distance = 0 self.index2word = [] + self.vocab = {} @property def vectors(self): diff --git a/gensim/test/test_poincare.py b/gensim/test/test_poincare.py index c057c81bf0..f2f51a576d 100644 --- a/gensim/test/test_poincare.py +++ b/gensim/test/test_poincare.py @@ -93,6 +93,21 @@ def test_persistence_separate_file(self): loaded = PoincareModel.load(testfile()) self.models_equal(model, loaded) + def test_online_learning(self): + """Tests whether additional input data is loaded correctly and completely.""" + model = PoincareModel(self.data, burn_in=0, negative=3) + self.assertEqual(len(model.kv.vocab), 7) + self.assertEqual(model.kv.vocab['kangaroo.n.01'].count, 3) + self.assertEqual(model.kv.vocab['cat.n.01'].count, 1) + model.train(epochs=2) + old_vectors = np.copy(model.kv.syn0) + + model.build_vocab([('kangaroo.n.01', 'cat.n.01')], update=True) # update vocab + self.assertEqual(model.kv.vocab['kangaroo.n.01'].count, 4) + self.assertEqual(model.kv.vocab['cat.n.01'].count, 2) + model.train(epochs=2) + self.assertFalse(np.allclose(old_vectors, model.kv.syn0)) + def test_train_after_load(self): """Tests whether the model can be trained correctly after loading from disk.""" model = PoincareModel(self.data, burn_in=0, negative=3) From 8ed8ab3dadced45677707f29d1458cb3f8f5a20d Mon Sep 17 00:00:00 2001 From: "kojiro.iizuka" Date: Wed, 29 May 2019 15:08:53 +0900 Subject: [PATCH 02/15] delete unused func --- gensim/models/poincare.py | 79 --------------------------------------- 1 file changed, 79 deletions(-) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index c4180572d5..45905afab4 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -569,85 +569,6 @@ def _update_vectors_batch(self, batch): self.kv.syn0[indices_v] -= v_updates self.kv.syn0[indices_v] = self._clip_vectors(self.kv.syn0[indices_v], self.epsilon) - def _build_vocab(self, relations=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False, - trim_rule=None, **kwargs): - """Build vocabulary from a sequence of sentences (can be a once-only generator stream). - Each sentence must be a list of unicode strings. - - Parameters - ---------- - relations : iterable of list of str, optional - Can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. - You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or - `corpus_file` arguments need to be passed (not both of them). - update : bool - If true, the new words in `sentences` will be added to model's vocab. - progress_per : int - Indicates how many words to process before showing/updating the progress. - keep_raw_vocab : bool - If not true, delete the raw vocabulary after the scaling is done and free up RAM. - trim_rule : function, optional - Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, - be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), - or a callable that accepts parameters (word, count, min_count) and returns either - :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The rule, if given, is only used to prune vocabulary during - :meth:`~gensim.models.fasttext.FastText.build_vocab` and is not stored as part of the model. - - The input parameters are of the following types: - * `word` (str) - the word we are examining - * `count` (int) - the word's frequency count in the corpus - * `min_count` (int) - the minimum count threshold. - - **kwargs - Additional key word parameters passed to - :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.build_vocab`. - - Examples - -------- - Train a model and update vocab for online training: - - .. sourcecode:: pycon - - >>> from gensim.models import FastText - >>> relations_1 = [('kangaroo', 'marsupial'), ('kangaroo', 'mammal')] - >>> relations_2 = [('gib', 'cat')] - >>> - >>> model = PoincareModel(relations, negative=2) - >>> model.build_vocab(relations_1) - >>> model.train(relations_1, epochs=5) - >>> - >>> model.build_vocab(relations_2, update=True) - >>> model.train(relations_2, epochs=5) - - """ - if not update: - self.wv.init_ngrams_weights(self.trainables.seed) - elif not len(self.wv.vocab): - raise RuntimeError( - "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " - "First build the vocabulary of your model with a corpus " - "by calling the gensim.models.fasttext.FastText.build_vocab method " - "before doing an online update." - ) - else: - self.vocabulary.old_vocab_len = len(self.wv.vocab) - - retval = super(FastText, self).build_vocab( - sentences=sentences, corpus_file=corpus_file, update=update, progress_per=progress_per, - keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) - - if update: - self.wv.update_ngrams_weights(self.trainables.seed, self.vocabulary.old_vocab_len) - - return retval - def train(self, epochs, batch_size=10, print_every=1000, check_gradients_every=None): """Train Poincare embeddings using loaded data and model parameters. From 7bef962cb0ccfb346869da01c31f9e09af8b5bfd Mon Sep 17 00:00:00 2001 From: "kojiro.iizuka" Date: Wed, 29 May 2019 15:14:22 +0900 Subject: [PATCH 03/15] fix test --- gensim/test/test_poincare.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/gensim/test/test_poincare.py b/gensim/test/test_poincare.py index f2f51a576d..9ea020da51 100644 --- a/gensim/test/test_poincare.py +++ b/gensim/test/test_poincare.py @@ -99,14 +99,9 @@ def test_online_learning(self): self.assertEqual(len(model.kv.vocab), 7) self.assertEqual(model.kv.vocab['kangaroo.n.01'].count, 3) self.assertEqual(model.kv.vocab['cat.n.01'].count, 1) - model.train(epochs=2) - old_vectors = np.copy(model.kv.syn0) - model.build_vocab([('kangaroo.n.01', 'cat.n.01')], update=True) # update vocab self.assertEqual(model.kv.vocab['kangaroo.n.01'].count, 4) self.assertEqual(model.kv.vocab['cat.n.01'].count, 2) - model.train(epochs=2) - self.assertFalse(np.allclose(old_vectors, model.kv.syn0)) def test_train_after_load(self): """Tests whether the model can be trained correctly after loading from disk.""" From 304b6318af088bb7e0cfecaedf5fd47e930b7a0e Mon Sep 17 00:00:00 2001 From: "kojiro.iizuka" Date: Fri, 21 Jun 2019 17:30:28 +0900 Subject: [PATCH 04/15] fix TypeError add doc --- gensim/models/poincare.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index 45905afab4..d3f55d942d 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -173,8 +173,34 @@ def __init__(self, train_data, size=50, alpha=0.1, negative=10, workers=1, epsil self._loss_grad = None self.build_vocab(train_data) - def build_vocab(self, relations=None, update=False): - """Load relations from the train data and build vocab.""" + def build_vocab(self, relations, update=False): + """Build vocabulary from a relations. + Each relations must be a tuples of unicode strings. + + Parameters + ---------- + relations : list of tuples + List of tuples of positive examples of the form (node_1_index, node_2_index). + update : bool + If true, the new nodes in `relations` will be added to model's vocab. + + Examples + -------- + Train a model and update vocab for online training: + + .. sourcecode:: pycon + + >>> from gensim.models.poincare import PoincareModel + >>> relations_1 = [('kangaroo', 'marsupial'), ('kangaroo', 'mammal')] + >>> relations_2 = [('striped_skunk', 'mammal')] + >>> + >>> model = PoincareModel(relations_1, negative=1) + >>> model.train(epochs=50) + >>> + >>> model.build_vocab(relations_2, update=True) + >>> model.train(epochs=50) + + """ old_index2word_len = len(self.kv.index2word) logger.info("loading relations from train data..") From cf312ccc9990003938c15f3fb02abfb13147b6fc Mon Sep 17 00:00:00 2001 From: "kojiro.iizuka" Date: Fri, 21 Jun 2019 17:45:39 +0900 Subject: [PATCH 05/15] mod description of relations in build_vocab --- gensim/models/poincare.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index d3f55d942d..45dd89da2a 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -175,12 +175,14 @@ def __init__(self, train_data, size=50, alpha=0.1, negative=10, workers=1, epsil def build_vocab(self, relations, update=False): """Build vocabulary from a relations. - Each relations must be a tuples of unicode strings. Parameters ---------- - relations : list of tuples - List of tuples of positive examples of the form (node_1_index, node_2_index). + relations : {iterable of (str, str), :class:`gensim.models.poincare.PoincareRelations`} + Iterable of relations, e.g. a list of tuples, or a :class:`gensim.models.poincare.PoincareRelations` + instance streaming from a file. Note that the relations are treated as ordered pairs, + i.e. a relation (a, b) does not imply the opposite relation (b, a). In case the relations are symmetric, + the data should contain both relations (a, b) and (b, a). update : bool If true, the new nodes in `relations` will be added to model's vocab. From fba43e9d24c2d1a5a836ff173c246ebcd98b344d Mon Sep 17 00:00:00 2001 From: "kojiro.iizuka" Date: Fri, 21 Jun 2019 18:08:31 +0900 Subject: [PATCH 06/15] add update=False description in build_vocab --- gensim/models/poincare.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index c88c41088e..61db094682 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -185,7 +185,8 @@ def build_vocab(self, relations, update=False): i.e. a relation (a, b) does not imply the opposite relation (b, a). In case the relations are symmetric, the data should contain both relations (a, b) and (b, a). update : bool - If true, the new nodes in `relations` will be added to model's vocab. + If true, new nodes's embeddings are initialized. + If false, all node's embeddings are initialized. Examples -------- From eeac67f4ca1a9c05b9d4b601c3350e3efe9287fc Mon Sep 17 00:00:00 2001 From: "kojiro.iizuka" Date: Fri, 21 Jun 2019 18:11:19 +0900 Subject: [PATCH 07/15] mod top description of build_vocab --- gensim/models/poincare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index 61db094682..d57fe31cd0 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -175,7 +175,7 @@ def __init__(self, train_data, size=50, alpha=0.1, negative=10, workers=1, epsil self.build_vocab(train_data) def build_vocab(self, relations, update=False): - """Build vocabulary from a relations. + """Build vocabulary from relations. Parameters ---------- From de0688d52703617eae76697e9846e2b64dcda2c6 Mon Sep 17 00:00:00 2001 From: koiizukag <41324565+koiizukag@users.noreply.github.com> Date: Wed, 26 Jun 2019 14:19:45 +0900 Subject: [PATCH 08/15] Update gensim/models/poincare.py Co-Authored-By: Michael Penkov --- gensim/models/poincare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index d57fe31cd0..da4a6cd170 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -175,7 +175,7 @@ def __init__(self, train_data, size=50, alpha=0.1, negative=10, workers=1, epsil self.build_vocab(train_data) def build_vocab(self, relations, update=False): - """Build vocabulary from relations. + """Build the model's vocabulary from known relations. Parameters ---------- From e8d7fb9c247597316e2fae79fcb199f07de61997 Mon Sep 17 00:00:00 2001 From: koiizukag <41324565+koiizukag@users.noreply.github.com> Date: Wed, 26 Jun 2019 14:20:31 +0900 Subject: [PATCH 09/15] Update gensim/models/poincare.py Co-Authored-By: Michael Penkov --- gensim/models/poincare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index da4a6cd170..41adf48d38 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -184,7 +184,7 @@ def build_vocab(self, relations, update=False): instance streaming from a file. Note that the relations are treated as ordered pairs, i.e. a relation (a, b) does not imply the opposite relation (b, a). In case the relations are symmetric, the data should contain both relations (a, b) and (b, a). - update : bool + update : bool, optional If true, new nodes's embeddings are initialized. If false, all node's embeddings are initialized. From afd0e21189f41e5358aedd6fe335b4bbe889844d Mon Sep 17 00:00:00 2001 From: koiizukag <41324565+koiizukag@users.noreply.github.com> Date: Wed, 26 Jun 2019 14:20:39 +0900 Subject: [PATCH 10/15] Update gensim/models/poincare.py Co-Authored-By: Michael Penkov --- gensim/models/poincare.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index 41adf48d38..e85e674cf3 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -185,7 +185,8 @@ def build_vocab(self, relations, update=False): i.e. a relation (a, b) does not imply the opposite relation (b, a). In case the relations are symmetric, the data should contain both relations (a, b) and (b, a). update : bool, optional - If true, new nodes's embeddings are initialized. + If true, only new nodes's embeddings are initialized. + Use this when the model already has an existing vocabulary and you want to update it. If false, all node's embeddings are initialized. Examples From 666b93cddd818f23f69c0ea9f73052a3a87cbc7c Mon Sep 17 00:00:00 2001 From: koiizukag <41324565+koiizukag@users.noreply.github.com> Date: Wed, 26 Jun 2019 14:20:59 +0900 Subject: [PATCH 11/15] Update gensim/models/poincare.py Co-Authored-By: Michael Penkov --- gensim/models/poincare.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index e85e674cf3..1da93ca52e 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -188,6 +188,7 @@ def build_vocab(self, relations, update=False): If true, only new nodes's embeddings are initialized. Use this when the model already has an existing vocabulary and you want to update it. If false, all node's embeddings are initialized. + Use this when you're creating a new vocabulary from scratch. Examples -------- From 32198846e5253b9f62b1b18a53a5b519d975a716 Mon Sep 17 00:00:00 2001 From: koiizukag <41324565+koiizukag@users.noreply.github.com> Date: Wed, 26 Jun 2019 14:22:41 +0900 Subject: [PATCH 12/15] Update gensim/models/poincare.py Co-Authored-By: Michael Penkov --- gensim/models/poincare.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index 1da93ca52e..0cac6cb1cf 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -200,7 +200,8 @@ def build_vocab(self, relations, update=False): >>> relations_1 = [('kangaroo', 'marsupial'), ('kangaroo', 'mammal')] >>> relations_2 = [('striped_skunk', 'mammal')] >>> - >>> model = PoincareModel(relations_1, negative=1) + >>> # train a new model from initial data + >>> model = PoincareModel(initial_relations, negative=1) >>> model.train(epochs=50) >>> >>> model.build_vocab(relations_2, update=True) From 02467a25554f79636fae76b92ff737068acc1a0b Mon Sep 17 00:00:00 2001 From: koiizukag <41324565+koiizukag@users.noreply.github.com> Date: Wed, 26 Jun 2019 14:22:58 +0900 Subject: [PATCH 13/15] Update gensim/models/poincare.py Co-Authored-By: Michael Penkov --- gensim/models/poincare.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index 0cac6cb1cf..53f6b43d75 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -204,7 +204,9 @@ def build_vocab(self, relations, update=False): >>> model = PoincareModel(initial_relations, negative=1) >>> model.train(epochs=50) >>> - >>> model.build_vocab(relations_2, update=True) + >>> # online training: update the vocabulary and continue training + >>> online_relations = [('striped_skunk', 'mammal')] + >>> model.build_vocab(online_relations, update=True) >>> model.train(epochs=50) """ From 38b5b7c964b8421251abe3a64c60f3ef2df5a547 Mon Sep 17 00:00:00 2001 From: koiizukag <41324565+koiizukag@users.noreply.github.com> Date: Wed, 26 Jun 2019 14:23:21 +0900 Subject: [PATCH 14/15] Update gensim/models/poincare.py Co-Authored-By: Michael Penkov --- gensim/models/poincare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index 53f6b43d75..a39bd17932 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -197,7 +197,7 @@ def build_vocab(self, relations, update=False): .. sourcecode:: pycon >>> from gensim.models.poincare import PoincareModel - >>> relations_1 = [('kangaroo', 'marsupial'), ('kangaroo', 'mammal')] + >>> initial_relations = [('kangaroo', 'marsupial'), ('kangaroo', 'mammal')] >>> relations_2 = [('striped_skunk', 'mammal')] >>> >>> # train a new model from initial data From 28007e68b045cd824035c725b371ae5dc759cb6e Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Wed, 26 Jun 2019 15:34:37 +0900 Subject: [PATCH 15/15] Update poincare.py minor update --- gensim/models/poincare.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index a39bd17932..42a3a60d48 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -197,10 +197,9 @@ def build_vocab(self, relations, update=False): .. sourcecode:: pycon >>> from gensim.models.poincare import PoincareModel - >>> initial_relations = [('kangaroo', 'marsupial'), ('kangaroo', 'mammal')] - >>> relations_2 = [('striped_skunk', 'mammal')] >>> >>> # train a new model from initial data + >>> initial_relations = [('kangaroo', 'marsupial'), ('kangaroo', 'mammal')] >>> model = PoincareModel(initial_relations, negative=1) >>> model.train(epochs=50) >>>