diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e3f3f6b9c..30c025f67b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ Changes - In hdpmodel and dtmmodel - NOT BACKWARDS COMPATIBLE! * Added random_state parameter to LdaState initializer and check_random_state() (@droudy, #113) +* Implemented LsiModel.docs_processed attribute 0.13.1, 2016-06-22 diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 26f249d79a..12a3c17d18 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -361,6 +361,7 @@ def add_documents(self, corpus, chunksize=None, decay=None): num_terms=self.num_terms, chunksize=chunksize, extra_dims=self.extra_samples, power_iters=self.power_iters) self.projection.merge(update, decay=decay) + self.docs_processed += len(corpus) if hasattr(corpus, '__len__') else 0 else: # the one-pass algo doc_no = 0 @@ -395,6 +396,7 @@ def add_documents(self, corpus, chunksize=None, decay=None): if self.dispatcher: logger.info("reached the end of input; now waiting for all remaining jobs to finish") self.projection = self.dispatcher.getstate() + self.docs_processed += doc_no # logger.info("top topics after adding %i documents" % doc_no) # self.print_debug(10) else: @@ -403,6 +405,7 @@ def add_documents(self, corpus, chunksize=None, decay=None): update = Projection(self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples, power_iters=self.power_iters) self.projection.merge(update, decay=decay) logger.info("processed sparse job of %i documents", corpus.shape[1]) + self.docs_processed += corpus.shape[1] def __str__(self): return "LsiModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % ( diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py index b5c3b1db29..abe65c0e3c 100644 --- a/gensim/test/test_corpora.py +++ b/gensim/test/test_corpora.py @@ -16,14 +16,17 @@ import numpy -from gensim.utils import to_unicode, smart_extension +from gensim.utils import to_unicode from gensim.interfaces import TransformedCorpus from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus, ucicorpus, malletcorpus, textcorpus, indexedcorpus) # needed because sample data files are located in the same folder module_path = os.path.dirname(__file__) -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) + + +def datapath(fname): + return os.path.join(module_path, 'test_data', fname) def testfile(): @@ -180,7 +183,7 @@ def test_indexing(self): self.assertEqual(len(docs), len(corpus)) self.assertEqual(len(docs), len(corpus[:])) self.assertEqual(len(docs[::2]), len(corpus[::2])) - + def _get_slice(corpus, slice_): # assertRaises for python 2.6 takes a callable return corpus[slice_] @@ -200,9 +203,9 @@ def _get_slice(corpus, slice_): # corpus does, and throws an error otherwise if hasattr(corpus, 'index') and corpus.index is not None: corpus_ = TransformedCorpus(DummyTransformer(), corpus) - self.assertEqual(corpus_[0][0][1], docs[0][0][1]+1) + self.assertEqual(corpus_[0][0][1], docs[0][0][1] + 1) self.assertRaises(ValueError, _get_slice, corpus_, set([1])) - transformed_docs = [val+1 for i, d in enumerate(docs) for _, val in d if i in [1, 3, 4]] + transformed_docs = [val + 1 for i, d in enumerate(docs) for _, val in d if i in [1, 3, 4]] self.assertEquals(transformed_docs, list(v for doc in corpus_[[1, 3, 4]] for _, v in doc)) self.assertEqual(3, len(corpus_[[1, 3, 4]])) else: @@ -214,12 +217,19 @@ def _get_slice(corpus, slice_): class TestMmCorpus(CorpusTestCase): def setUp(self): self.corpus_class = mmcorpus.MmCorpus + self.corpus = self.corpus_class(datapath('testcorpus.mm')) self.file_extension = '.mm' def test_serialize_compressed(self): # MmCorpus needs file write with seek => doesn't support compressed output (only input) pass + def test_load(self): + self.assertEqual(self.corpus.num_docs, 9) + self.assertEqual(self.corpus.num_terms, 12) + self.assertEqual(self.corpus.num_nnz, 28) + self.assertEqual(tuple(self.corpus.index), (97, 121, 169, 201, 225, 249, 258, 276, 303)) + class TestSvmLightCorpus(CorpusTestCase): def setUp(self): diff --git a/gensim/test/test_lsimodel.py b/gensim/test/test_lsimodel.py index 457725ebbb..26df7c011e 100644 --- a/gensim/test/test_lsimodel.py +++ b/gensim/test/test_lsimodel.py @@ -24,20 +24,23 @@ from gensim import matutils -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder + + +def datapath(fname): + return os.path.join(module_path, 'test_data', fname) # set up vars used in testing ("Deerwester" from the web tutorial) texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey']] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] @@ -59,16 +62,15 @@ def testTransform(self): # make sure the decomposition is enough accurate u, s, vt = scipy.linalg.svd(matutils.corpus2dense(self.corpus, self.corpus.num_terms), full_matrices=False) - self.assertTrue(numpy.allclose(s[:2], model.projection.s)) # singular values must match + self.assertTrue(numpy.allclose(s[:2], model.projection.s)) # singular values must match # transform one document doc = list(self.corpus)[0] transformed = model[doc] - vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests - expected = numpy.array([-0.6594664, 0.142115444]) # scaled LSI version - # expected = numpy.array([-0.1973928, 0.05591352]) # non-scaled LSI version - self.assertTrue(numpy.allclose(abs(vec), abs(expected))) # transformed entries must be equal up to sign - + vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests + expected = numpy.array([-0.6594664, 0.142115444]) # scaled LSI version + # expected = numpy.array([-0.1973928, 0.05591352]) # non-scaled LSI version + self.assertTrue(numpy.allclose(abs(vec), abs(expected))) # transformed entries must be equal up to sign def testShowTopic(self): topic = self.model.show_topic(1) @@ -77,7 +79,6 @@ def testShowTopic(self): self.assertTrue(isinstance(k, six.string_types)) self.assertTrue(isinstance(v, float)) - def testShowTopics(self): topics = self.model.show_topics(formatted=False) @@ -88,49 +89,47 @@ def testShowTopics(self): self.assertTrue(isinstance(k, six.string_types)) self.assertTrue(isinstance(v, float)) - def testCorpusTransform(self): """Test lsi[corpus] transformation.""" model = self.model got = numpy.vstack(matutils.sparse2full(doc, 2) for doc in model[self.corpus]) expected = numpy.array([ - [ 0.65946639, 0.14211544], - [ 2.02454305, -0.42088759], - [ 1.54655361, 0.32358921], - [ 1.81114125, 0.5890525 ], - [ 0.9336738 , -0.27138939], - [ 0.01274618, -0.49016181], - [ 0.04888203, -1.11294699], - [ 0.08063836, -1.56345594], - [ 0.27381003, -1.34694159]]) - self.assertTrue(numpy.allclose(abs(got), abs(expected))) # must equal up to sign - + [0.65946639, 0.14211544], + [2.02454305, -0.42088759], + [1.54655361, 0.32358921], + [1.81114125, 0.5890525 ], + [0.9336738 , -0.27138939], + [0.01274618, -0.49016181], + [0.04888203, -1.11294699], + [0.08063836, -1.56345594], + [0.27381003, -1.34694159]]) + self.assertTrue(numpy.allclose(abs(got), abs(expected))) # must equal up to sign def testOnlineTransform(self): corpus = list(self.corpus) - doc = corpus[0] # use the corpus' first document for testing + doc = corpus[0] # use the corpus' first document for testing # create the transformation model - model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5) # compute everything at once - model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5) # start with no documents, we will add them later + model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5) # compute everything at once + model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5) # start with no documents, we will add them later # train model on a single document model.add_documents([corpus[0]]) # transform the testing document with this partial transformation transformed = model[doc] - vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests - expected = numpy.array([-1.73205078, 0.0, 0.0, 0.0, 0.0]) # scaled LSI version - self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign + vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests + expected = numpy.array([-1.73205078, 0.0, 0.0, 0.0, 0.0]) # scaled LSI version + self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign # train on another 4 documents - model.add_documents(corpus[1:5], chunksize=2) # train on 4 extra docs, in chunks of 2 documents, for the lols + model.add_documents(corpus[1:5], chunksize=2) # train on 4 extra docs, in chunks of 2 documents, for the lols # transform a document with this partial transformation transformed = model[doc] - vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests - expected = numpy.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269]) # scaled LSI version - self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign + vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests + expected = numpy.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269]) # scaled LSI version + self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign # train on the rest of documents model.add_documents(corpus[5:]) @@ -138,8 +137,7 @@ def testOnlineTransform(self): # make sure the final transformation is the same as if we had decomposed the whole corpus at once vec1 = matutils.sparse2full(model[doc], model.num_topics) vec2 = matutils.sparse2full(model2[doc], model2.num_topics) - self.assertTrue(numpy.allclose(abs(vec1), abs(vec2), atol=1e-5)) # the two LSI representations must equal up to sign - + self.assertTrue(numpy.allclose(abs(vec1), abs(vec2), atol=1e-5)) # the two LSI representations must equal up to sign def testPersistence(self): fname = testfile() @@ -150,7 +148,7 @@ def testPersistence(self): self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u)) self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s)) tstvec = [] - self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testPersistenceCompressed(self): fname = testfile() + '.gz' @@ -161,7 +159,7 @@ def testPersistenceCompressed(self): self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u)) self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s)) tstvec = [] - self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testLargeMmap(self): fname = testfile() @@ -178,7 +176,7 @@ def testLargeMmap(self): self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u)) self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s)) tstvec = [] - self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testLargeMmapCompressed(self): fname = testfile() + '.gz' @@ -194,7 +192,11 @@ def testLargeMmapCompressed(self): # to be mmaped! self.assertRaises(IOError, lsimodel.LsiModel.load, fname, mmap='r') -#endclass TestLsiModel + def testDocsProcessed(self): + self.assertEqual(self.model.docs_processed, 9) + self.assertEqual(self.model.docs_processed, self.corpus.num_docs) + +# endclass TestLsiModel if __name__ == '__main__':