diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index bea5a0f507..3639b267c4 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -43,6 +43,7 @@ from scipy.special import polygamma from six.moves import xrange import six +import json # log(sum(exp(x))) that tries to avoid overflow try: @@ -979,7 +980,7 @@ def __getitem__(self, bow, eps=None): """ return self.get_document_topics(bow, eps) - def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs): + def save(self, fname, ignore=['state', 'dispatcher'], separately = None, *args, **kwargs): """ Save the model to file. @@ -1018,7 +1019,41 @@ def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs): ignore = list(set(['state', 'dispatcher']) | set(ignore)) else: ignore = ['state', 'dispatcher'] - super(LdaModel, self).save(fname, *args, ignore=ignore, **kwargs) + + # make sure 'expElogbeta' and 'sstats' are ignored from the pickled object, even if + # someone sets the separately list themselves. + separately_explicit = ['expElogbeta', 'sstats'] + # Also add 'alpha' and 'eta' to separately list if they are set 'auto' or some + # array manually. + if (isinstance(self.alpha, six.string_types) and self.alpha == 'auto') or len(self.alpha.shape) != 1: + separately_explicit.append('alpha') + if (isinstance(self.eta, six.string_types) and self.eta == 'auto') or len(self.eta.shape) != 1: + separately_explicit.append('eta') + # Merge separately_explicit with separately. + if separately: + if isinstance(separately, six.string_types): + separately = [separately] + separately = [e for e in separately if e] # make sure None and '' are not in the list + separately = list(set(separately_explicit) | set(separately)) + else: + separately = separately_explicit + + # id2word needs to be saved separately. + # If id2word is not already in ignore, then save it separately as JSON. 
+ id2word = None + if self.id2word is not None and 'id2word' not in ignore: + id2word = dict((k,v) for k,v in self.id2word.iteritems()) + self.id2word = None # remove the dictionary from model + super(LdaModel, self).save(fname, ignore=ignore, separately = separately, *args, **kwargs) + self.id2word = id2word # restore the dictionary. + + # Save the dictionary separately in json. + id2word_fname = utils.smart_extension(fname, '.json') + try: + with utils.smart_open(id2word_fname, 'w', encoding='utf-8') as fout: + json.dump(id2word, fout) + except Exception as e: + logging.warning("failed to save id2words dictionary in %s: %s", id2word_fname, e) @classmethod def load(cls, fname, *args, **kwargs): @@ -1032,6 +1067,18 @@ def load(cls, fname, *args, **kwargs): """ kwargs['mmap'] = kwargs.get('mmap', None) result = super(LdaModel, cls).load(fname, *args, **kwargs) + # Load the separately stored id2word dictionary saved in json. + id2word_fname = utils.smart_extension(fname, '.json') + try: + with utils.smart_open(id2word_fname, 'r') as fin: + id2word = json.load(fin) + if id2word is not None: + result.id2word = utils.FakeDict(id2word) + else: + result.id2word = None + except Exception as e: + logging.warning("failed to load id2words from %s: %s", id2word_fname, e) + state_fname = utils.smart_extension(fname, '.state') try: result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs) diff --git a/gensim/test/ldamodel_python_2_7 b/gensim/test/ldamodel_python_2_7 new file mode 100644 index 0000000000..f2ee3d6f09 Binary files /dev/null and b/gensim/test/ldamodel_python_2_7 differ diff --git a/gensim/test/ldamodel_python_2_7.eta.npy b/gensim/test/ldamodel_python_2_7.eta.npy new file mode 100644 index 0000000000..a5ceb80b2d Binary files /dev/null and b/gensim/test/ldamodel_python_2_7.eta.npy differ diff --git a/gensim/test/ldamodel_python_2_7.expElogbeta.npy b/gensim/test/ldamodel_python_2_7.expElogbeta.npy new file mode 100644 index 0000000000..1971e44b14 Binary 
files /dev/null and b/gensim/test/ldamodel_python_2_7.expElogbeta.npy differ diff --git a/gensim/test/ldamodel_python_2_7.json b/gensim/test/ldamodel_python_2_7.json new file mode 100644 index 0000000000..5ff1321c8e --- /dev/null +++ b/gensim/test/ldamodel_python_2_7.json @@ -0,0 +1 @@ +{"0": "interface", "1": "computer", "2": "human", "3": "response", "4": "time", "5": "survey", "6": "system", "7": "user", "8": "eps", "9": "trees", "10": "graph", "11": "minors"} \ No newline at end of file diff --git a/gensim/test/ldamodel_python_2_7.state b/gensim/test/ldamodel_python_2_7.state new file mode 100644 index 0000000000..fcc60724d1 Binary files /dev/null and b/gensim/test/ldamodel_python_2_7.state differ diff --git a/gensim/test/ldamodel_python_3_5 b/gensim/test/ldamodel_python_3_5 new file mode 100644 index 0000000000..61412b9f7e Binary files /dev/null and b/gensim/test/ldamodel_python_3_5 differ diff --git a/gensim/test/ldamodel_python_3_5.eta.npy b/gensim/test/ldamodel_python_3_5.eta.npy new file mode 100644 index 0000000000..a5ceb80b2d Binary files /dev/null and b/gensim/test/ldamodel_python_3_5.eta.npy differ diff --git a/gensim/test/ldamodel_python_3_5.expElogbeta.npy b/gensim/test/ldamodel_python_3_5.expElogbeta.npy new file mode 100644 index 0000000000..1971e44b14 Binary files /dev/null and b/gensim/test/ldamodel_python_3_5.expElogbeta.npy differ diff --git a/gensim/test/ldamodel_python_3_5.json b/gensim/test/ldamodel_python_3_5.json new file mode 100644 index 0000000000..f64397704b --- /dev/null +++ b/gensim/test/ldamodel_python_3_5.json @@ -0,0 +1 @@ +{"0": "interface", "1": "human", "2": "computer", "3": "response", "4": "system", "5": "user", "6": "time", "7": "survey", "8": "eps", "9": "trees", "10": "graph", "11": "minors"} \ No newline at end of file diff --git a/gensim/test/ldamodel_python_3_5.state b/gensim/test/ldamodel_python_3_5.state new file mode 100644 index 0000000000..8f995aeab8 Binary files /dev/null and 
b/gensim/test/ldamodel_python_3_5.state differ diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index 2bb9796b5b..4003482bd4 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -366,6 +366,24 @@ def testPersistence(self): tstvec = [] self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + # Method used to save LDA models in Python 2.7 and 3.5 environments. + # def testSaveModelsForPythonVersion(self): + # fname = os.path.join(os.path.dirname(__file__), 'ldamodel_python_2_7') + # corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) + # model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=2, passes=100, random_state = 1000007) + # model.save(fname) + # logging.warning("LDA Model saved") + + def testModelCompatibilityWithPythonVersions(self): + fname_model_2_7 = os.path.join(os.path.dirname(__file__), 'ldamodel_python_2_7') + model_2_7 = self.class_.load(fname_model_2_7) + fname_model_3_5 = os.path.join(os.path.dirname(__file__), 'ldamodel_python_3_5') + model_3_5 = self.class_.load(fname_model_3_5) + self.assertEqual(model_2_7.num_topics, model_3_5.num_topics) + self.assertTrue(numpy.allclose(model_2_7.expElogbeta, model_3_5.expElogbeta)) + tstvec = [] + self.assertTrue(numpy.allclose(model_2_7[tstvec], model_3_5[tstvec])) # try projecting an empty vector + def testPersistenceIgnore(self): fname = testfile() model = ldamodel.LdaModel(self.corpus, num_topics=2) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index e7369835e8..fb6431c065 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -233,6 +233,19 @@ def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self): binary_model_with_vocab.save(testfile()) binary_model_with_vocab = word2vec.Word2Vec.load(testfile()) self.assertEqual(model.vocab['human'].count, binary_model_with_vocab.vocab['human'].count) + + # def 
testSaveModelsForPythonVersion(self): + # fname = os.path.join(os.path.dirname(__file__), 'word2vecmodel_python_3_5') + # model = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=1, negative=0) + # model.save(fname) + # logging.warning("Word2Vec model saved") + + def testModelCompatibilityWithPythonVersions(self): + fname_model_2_7 = os.path.join(os.path.dirname(__file__), 'word2vecmodel_python_2_7') + model_2_7 = word2vec.Word2Vec.load(fname_model_2_7) + fname_model_3_5 = os.path.join(os.path.dirname(__file__), 'word2vecmodel_python_3_5') + model_3_5 = word2vec.Word2Vec.load(fname_model_3_5) + self.models_equal(model_2_7, model_3_5) def testLargeMmap(self): """Test storing/loading the entire model.""" diff --git a/gensim/test/word2vecmodel_python_2_7 b/gensim/test/word2vecmodel_python_2_7 new file mode 100644 index 0000000000..bf73d4c63d Binary files /dev/null and b/gensim/test/word2vecmodel_python_2_7 differ diff --git a/gensim/test/word2vecmodel_python_3_5 b/gensim/test/word2vecmodel_python_3_5 new file mode 100644 index 0000000000..b9a30df521 Binary files /dev/null and b/gensim/test/word2vecmodel_python_3_5 differ diff --git a/gensim/utils.py b/gensim/utils.py index 606060bb38..dfca56f5f3 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -907,10 +907,12 @@ def pickle(obj, fname, protocol=2): def unpickle(fname): """Load pickled object from `fname`""" - with smart_open(fname) as f: + with smart_open(fname, 'rb') as f: # Because of loading from S3 load can't be used (missing readline in smart_open) - return _pickle.loads(f.read()) - + if sys.version_info > (3,0): + return _pickle.load(f, encoding='latin1') + else: + return _pickle.loads(f.read()) def revdict(d): """