From b1f004e19fed2cd9f9fd625eed24fb98405a6c05 Mon Sep 17 00:00:00 2001 From: Darin Deforest Date: Wed, 11 Apr 2018 11:02:41 -0700 Subject: [PATCH] Changed from using floats to ints for doc terms & frequencies --- gensim/models/ldamodel.py | 3 ++- gensim/utils.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 6d37ac0f7d..4bf0f5d18b 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -698,7 +698,8 @@ def rho(): dirty = False reallen = 0 - for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize, as_numpy=chunks_as_numpy)): + for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize, as_numpy=chunks_as_numpy, + dtype=self.dtype)): reallen += len(chunk) # keep track of how many documents we've processed so far if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)): diff --git a/gensim/utils.py b/gensim/utils.py index f6e5c4fdf3..6ef2f4ba80 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1119,7 +1119,7 @@ def substitute_entity(match): return RE_HTML_ENTITY.sub(substitute_entity, text) -def chunkize_serial(iterable, chunksize, as_numpy=False): +def chunkize_serial(iterable, chunksize, as_numpy=False,dtype=np.float32): """Give elements from the iterable in `chunksize`-ed lists. The last returned element may be smaller (if length of collection is not divisible by `chunksize`). @@ -1148,7 +1148,7 @@ def chunkize_serial(iterable, chunksize, as_numpy=False): if as_numpy: # convert each document to a 2d numpy array (~6x faster when transmitting # chunk data over the wire, in Pyro) - wrapped_chunk = [[np.array(doc) for doc in itertools.islice(it, int(chunksize))]] + wrapped_chunk = [[np.asarray(doc,dtype=dtype) for doc in itertools.islice(it, int(chunksize))]] else: wrapped_chunk = [list(itertools.islice(it, int(chunksize)))] if not wrapped_chunk[0]: