Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LsiModel.docs_processed attribute #763

Merged
merged 17 commits into from
Jun 30, 2016
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion gensim/corpora/mmcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from gensim.corpora import IndexedCorpus


logger = logging.getLogger('gensim.corpora.mmcorpus')
logger = logging.getLogger(__name__)


class MmCorpus(matutils.MmReader, IndexedCorpus):
Expand All @@ -34,6 +34,7 @@ def __iter__(self):
(yielding one document at a time).
"""
for doc_id, doc in super(MmCorpus, self).__iter__():
logger.debug('{0}'.format(doc_id))
yield doc # get rid of doc id, return the sparse vector only

@staticmethod
Expand Down
3 changes: 3 additions & 0 deletions gensim/models/lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@ def add_documents(self, corpus, chunksize=None, decay=None):
num_terms=self.num_terms, chunksize=chunksize,
extra_dims=self.extra_samples, power_iters=self.power_iters)
self.projection.merge(update, decay=decay)
self.docs_processed += len(corpus) if hasattr(corpus, '__len__') else 0
else:
# the one-pass algo
doc_no = 0
Expand Down Expand Up @@ -395,6 +396,7 @@ def add_documents(self, corpus, chunksize=None, decay=None):
if self.dispatcher:
logger.info("reached the end of input; now waiting for all remaining jobs to finish")
self.projection = self.dispatcher.getstate()
self.docs_processed += len(corpus) if hasattr(corpus, '__len__') else doc_no
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we always use doc_no?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea, I guess so, for this line.

# logger.info("top topics after adding %i documents" % doc_no)
# self.print_debug(10)
else:
Expand All @@ -403,6 +405,7 @@ def add_documents(self, corpus, chunksize=None, decay=None):
update = Projection(self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples, power_iters=self.power_iters)
self.projection.merge(update, decay=decay)
logger.info("processed sparse job of %i documents", corpus.shape[1])
self.docs_processed += corpus.shape[1]

def __str__(self):
return "LsiModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % (
Expand Down
20 changes: 15 additions & 5 deletions gensim/test/test_corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,17 @@

import numpy

from gensim.utils import to_unicode, smart_extension
from gensim.utils import to_unicode # , smart_extension
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove import if no longer needed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Side note: There are many more unused imports throughout gensim. They can be dangerous to remove, though, for someone like me who is unfamiliar with the internals of the packages being imported. For example, `import seaborn` has side effects, and obviously `from __future__ import division` does too.

from gensim.interfaces import TransformedCorpus
from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
ucicorpus, malletcorpus, textcorpus, indexedcorpus)

# needed because sample data files are located in the same folder
module_path = os.path.dirname(__file__)
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)


def datapath(fname):
    """Return the path of `fname` inside the ``test_data`` folder under ``module_path``."""
    test_data_dir = os.path.join(module_path, 'test_data')
    return os.path.join(test_data_dir, fname)


def testfile():
Expand Down Expand Up @@ -180,7 +183,7 @@ def test_indexing(self):
self.assertEqual(len(docs), len(corpus))
self.assertEqual(len(docs), len(corpus[:]))
self.assertEqual(len(docs[::2]), len(corpus[::2]))

def _get_slice(corpus, slice_):
# assertRaises for python 2.6 takes a callable
return corpus[slice_]
Expand All @@ -200,9 +203,9 @@ def _get_slice(corpus, slice_):
# corpus does, and throws an error otherwise
if hasattr(corpus, 'index') and corpus.index is not None:
corpus_ = TransformedCorpus(DummyTransformer(), corpus)
self.assertEqual(corpus_[0][0][1], docs[0][0][1]+1)
self.assertEqual(corpus_[0][0][1], docs[0][0][1] + 1)
self.assertRaises(ValueError, _get_slice, corpus_, set([1]))
transformed_docs = [val+1 for i, d in enumerate(docs) for _, val in d if i in [1, 3, 4]]
transformed_docs = [val + 1 for i, d in enumerate(docs) for _, val in d if i in [1, 3, 4]]
self.assertEquals(transformed_docs, list(v for doc in corpus_[[1, 3, 4]] for _, v in doc))
self.assertEqual(3, len(corpus_[[1, 3, 4]]))
else:
Expand All @@ -214,12 +217,19 @@ def _get_slice(corpus, slice_):
class TestMmCorpus(CorpusTestCase):
    """Corpus test case configured for ``mmcorpus.MmCorpus`` and the ``.mm`` extension."""

    def setUp(self):
        """Select the corpus class under test and load the ``testcorpus.mm`` sample."""
        self.file_extension = '.mm'
        self.corpus_class = mmcorpus.MmCorpus
        sample_path = datapath('testcorpus.mm')
        self.corpus = self.corpus_class(sample_path)

    def test_serialize_compressed(self):
        # Intentionally does nothing:
        # MmCorpus needs file write with seek => doesn't support compressed output (only input)
        pass

    def test_load(self):
        # Check the statistics and the index of the sample corpus loaded in setUp.
        loaded = self.corpus
        expected_index = (97, 121, 169, 201, 225, 249, 258, 276, 303)

        self.assertEqual(loaded.num_docs, 9)
        self.assertEqual(loaded.num_terms, 12)
        self.assertEqual(loaded.num_nnz, 28)
        self.assertEqual(tuple(loaded.index), expected_index)


class TestSvmLightCorpus(CorpusTestCase):
def setUp(self):
Expand Down
92 changes: 47 additions & 45 deletions gensim/test/test_lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,23 @@
from gensim import matutils


module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder


def datapath(fname):
    """Build the path to the sample data file `fname` in ``module_path``'s ``test_data`` folder."""
    data_dir = os.path.join(module_path, 'test_data')
    return os.path.join(data_dir, fname)


# set up vars used in testing ("Deerwester" from the web tutorial)
texts = [['human', 'interface', 'computer'],
['survey', 'user', 'computer', 'system', 'response', 'time'],
['eps', 'user', 'interface', 'system'],
['system', 'human', 'system', 'eps'],
['user', 'response', 'time'],
['trees'],
['graph', 'trees'],
['graph', 'minors', 'trees'],
['graph', 'minors', 'survey']]
['survey', 'user', 'computer', 'system', 'response', 'time'],
['eps', 'user', 'interface', 'system'],
['system', 'human', 'system', 'eps'],
['user', 'response', 'time'],
['trees'],
['graph', 'trees'],
['graph', 'minors', 'trees'],
['graph', 'minors', 'survey']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

Expand All @@ -59,16 +62,15 @@ def testTransform(self):

# make sure the decomposition is accurate enough
u, s, vt = scipy.linalg.svd(matutils.corpus2dense(self.corpus, self.corpus.num_terms), full_matrices=False)
self.assertTrue(numpy.allclose(s[:2], model.projection.s)) # singular values must match
self.assertTrue(numpy.allclose(s[:2], model.projection.s)) # singular values must match

# transform one document
doc = list(self.corpus)[0]
transformed = model[doc]
vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests
expected = numpy.array([-0.6594664, 0.142115444]) # scaled LSI version
# expected = numpy.array([-0.1973928, 0.05591352]) # non-scaled LSI version
self.assertTrue(numpy.allclose(abs(vec), abs(expected))) # transformed entries must be equal up to sign

vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests
expected = numpy.array([-0.6594664, 0.142115444]) # scaled LSI version
# expected = numpy.array([-0.1973928, 0.05591352]) # non-scaled LSI version
self.assertTrue(numpy.allclose(abs(vec), abs(expected))) # transformed entries must be equal up to sign

def testShowTopic(self):
topic = self.model.show_topic(1)
Expand All @@ -77,7 +79,6 @@ def testShowTopic(self):
self.assertTrue(isinstance(k, six.string_types))
self.assertTrue(isinstance(v, float))


def testShowTopics(self):
topics = self.model.show_topics(formatted=False)

Expand All @@ -88,58 +89,55 @@ def testShowTopics(self):
self.assertTrue(isinstance(k, six.string_types))
self.assertTrue(isinstance(v, float))


def testCorpusTransform(self):
"""Test lsi[corpus] transformation."""
model = self.model
got = numpy.vstack(matutils.sparse2full(doc, 2) for doc in model[self.corpus])
expected = numpy.array([
[ 0.65946639, 0.14211544],
[ 2.02454305, -0.42088759],
[ 1.54655361, 0.32358921],
[ 1.81114125, 0.5890525 ],
[ 0.9336738 , -0.27138939],
[ 0.01274618, -0.49016181],
[ 0.04888203, -1.11294699],
[ 0.08063836, -1.56345594],
[ 0.27381003, -1.34694159]])
self.assertTrue(numpy.allclose(abs(got), abs(expected))) # must equal up to sign

[0.65946639, 0.14211544],
[2.02454305, -0.42088759],
[1.54655361, 0.32358921],
[1.81114125, 0.5890525 ],
[0.9336738 , -0.27138939],
[0.01274618, -0.49016181],
[0.04888203, -1.11294699],
[0.08063836, -1.56345594],
[0.27381003, -1.34694159]])
self.assertTrue(numpy.allclose(abs(got), abs(expected))) # must equal up to sign

def testOnlineTransform(self):
corpus = list(self.corpus)
doc = corpus[0] # use the corpus' first document for testing
doc = corpus[0] # use the corpus' first document for testing

# create the transformation model
model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5) # compute everything at once
model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5) # start with no documents, we will add them later
model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5) # compute everything at once
model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5) # start with no documents, we will add them later

# train model on a single document
model.add_documents([corpus[0]])

# transform the testing document with this partial transformation
transformed = model[doc]
vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests
expected = numpy.array([-1.73205078, 0.0, 0.0, 0.0, 0.0]) # scaled LSI version
self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign
vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests
expected = numpy.array([-1.73205078, 0.0, 0.0, 0.0, 0.0]) # scaled LSI version
self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign

# train on another 4 documents
model.add_documents(corpus[1:5], chunksize=2) # train on 4 extra docs, in chunks of 2 documents, for the lols
model.add_documents(corpus[1:5], chunksize=2) # train on 4 extra docs, in chunks of 2 documents, for the lols

# transform a document with this partial transformation
transformed = model[doc]
vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests
expected = numpy.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269]) # scaled LSI version
self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign
vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests
expected = numpy.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269]) # scaled LSI version
self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign

# train on the rest of documents
model.add_documents(corpus[5:])

# make sure the final transformation is the same as if we had decomposed the whole corpus at once
vec1 = matutils.sparse2full(model[doc], model.num_topics)
vec2 = matutils.sparse2full(model2[doc], model2.num_topics)
self.assertTrue(numpy.allclose(abs(vec1), abs(vec2), atol=1e-5)) # the two LSI representations must equal up to sign

self.assertTrue(numpy.allclose(abs(vec1), abs(vec2), atol=1e-5)) # the two LSI representations must equal up to sign

def testPersistence(self):
fname = testfile()
Expand All @@ -150,7 +148,7 @@ def testPersistence(self):
self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u))
self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s))
tstvec = []
self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector

def testPersistenceCompressed(self):
fname = testfile() + '.gz'
Expand All @@ -161,7 +159,7 @@ def testPersistenceCompressed(self):
self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u))
self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s))
tstvec = []
self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector

def testLargeMmap(self):
fname = testfile()
Expand All @@ -178,7 +176,7 @@ def testLargeMmap(self):
self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u))
self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s))
tstvec = []
self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector

def testLargeMmapCompressed(self):
fname = testfile() + '.gz'
Expand All @@ -194,7 +192,11 @@ def testLargeMmapCompressed(self):
# to be mmaped!
self.assertRaises(IOError, lsimodel.LsiModel.load, fname, mmap='r')

#endclass TestLsiModel
def testDocsProcessed(self):
    # The model's document counter must equal 9 and also match the
    # number of documents reported by the test corpus.
    processed = self.model.docs_processed
    self.assertEqual(processed, 9)
    self.assertEqual(processed, self.corpus.num_docs)

# endclass TestLsiModel


if __name__ == '__main__':
Expand Down