Skip to content

Commit

Permalink
Remove ignore of E731
Browse files Browse the repository at this point in the history
  • Loading branch information
horpto committed Nov 2, 2017
1 parent c583b28 commit 04d7f87
Show file tree
Hide file tree
Showing 43 changed files with 201 additions and 179 deletions.
3 changes: 2 additions & 1 deletion gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
from six.moves import xrange, zip as izip


blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]
def blas(name, ndarray):
    """Return the BLAS routine called `name` that matches `ndarray`.

    Thin wrapper around `scipy.linalg.get_blas_funcs`: a single routine is
    requested for a single array, and the one-element result is unwrapped.

    Parameters
    ----------
    name : str
        Name of the BLAS routine, as accepted by `scipy.linalg.get_blas_funcs`.
    ndarray : numpy.ndarray
        Array passed to `get_blas_funcs` to select the routine variant.

    Returns
    -------
    callable
        The first (and only) function returned by `get_blas_funcs`.

    """
    return scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]

logger = logging.getLogger(__name__)

Expand Down
9 changes: 6 additions & 3 deletions gensim/similarities/docsim.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,18 +336,21 @@ def __getitem__(self, query):
# the following uses a lot of lazy evaluation and (optionally) parallel
# processing, to improve query latency and minimize memory footprint.
offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])
convert = lambda doc, shard_no: [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc]

def convert(shard_no, doc):
    """Shift the document indices in `doc` by the offset of shard `shard_no`.

    `doc` is an iterable of `(doc_index, sim)` pairs. Each `doc_index` is
    increased by `offsets[shard_no]`, where `offsets` comes from the
    enclosing scope; the similarity value is passed through unchanged.

    Returns a list of `(shifted_doc_index, sim)` tuples.
    """
    return [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc]

is_corpus, query = utils.is_corpus(query)
is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1
if not is_corpus:
# user asked for num_best most similar and query is a single doc
results = (convert(result, shard_no) for shard_no, result in enumerate(shard_results))
results = (convert(shard_no, result) for shard_no, result in enumerate(shard_results))
result = heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: item[1])
else:
# the trickiest combination: returning num_best results when query was a corpus
results = []
for shard_no, result in enumerate(shard_results):
shard_result = [convert(doc, shard_no) for doc in result]
shard_result = [convert(shard_no, doc) for doc in result]
results.append(shard_result)
result = []
for parts in izip(*results):
Expand Down
16 changes: 5 additions & 11 deletions gensim/sklearn_api/atmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,17 +76,11 @@ def transform(self, author_names):
"This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
)

check = lambda x: [x] if not isinstance(x, list) else x
author_names = check(author_names)
X = [[] for _ in range(0, len(author_names))]

for k, v in enumerate(author_names):
transformed_author = self.gensim_model[v]
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
probs_author = matutils.sparse2full(transformed_author, self.num_topics)
X[k] = probs_author

return np.reshape(np.array(X), (len(author_names), self.num_topics))
if not isinstance(author_names, list):
author_names = [author_names]
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
topics = [matutils.sparse2full(self.gensim_model[author_name], self.num_topics) for author_name in author_names]
return np.reshape(np.array(topics), (len(author_names), self.num_topics))

def partial_fit(self, X, author2doc=None, doc2author=None):
"""
Expand Down
13 changes: 4 additions & 9 deletions gensim/sklearn_api/d2vmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,7 @@ def transform(self, docs):
)

# The input as array of array
check = lambda x: [x] if isinstance(x[0], string_types) else x
docs = check(docs)
X = [[] for _ in range(0, len(docs))]

for k, v in enumerate(docs):
doc_vec = self.gensim_model.infer_vector(v)
X[k] = doc_vec

return np.reshape(np.array(X), (len(docs), self.gensim_model.vector_size))
if isinstance(docs[0], string_types):
docs = [docs]
vectors = [self.gensim_model.infer_vector(doc) for doc in docs]
return np.reshape(np.array(vectors), (len(docs), self.gensim_model.vector_size))
27 changes: 12 additions & 15 deletions gensim/sklearn_api/hdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,21 +77,18 @@ def transform(self, docs):
)

# The input as array of array
check = lambda x: [x] if isinstance(x[0], tuple) else x
docs = check(docs)
X = [[] for _ in range(0, len(docs))]

max_num_topics = 0
for k, v in enumerate(docs):
X[k] = self.gensim_model[v]
max_num_topics = max(max_num_topics, max(x[0] for x in X[k]) + 1)

for k, v in enumerate(X):
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
dense_vec = matutils.sparse2full(v, max_num_topics)
X[k] = dense_vec

return np.reshape(np.array(X), (len(docs), max_num_topics))
if isinstance(docs[0], tuple):
docs = [docs]
distribution, max_num_topics = [], 0

for doc in docs:
topicd = self.gensim_model[doc]
distribution.append(topicd)
max_num_topics = max(max_num_topics, max(topic[0] for topic in topicd) + 1)

# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
distribution = [matutils.sparse2full(topicd, max_num_topics) for topicd in distribution]
return np.reshape(np.array(distribution), (len(docs), max_num_topics))

def partial_fit(self, X):
"""
Expand Down
15 changes: 5 additions & 10 deletions gensim/sklearn_api/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,11 @@ def transform(self, docs):
raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")

# The input as array of array
check = lambda x: [x] if isinstance(x[0], tuple) else x
docs = check(docs)
X = [[] for _ in range(0, len(docs))]

for k, v in enumerate(docs):
doc_topics = self.gensim_model[v]
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
probs_docs = matutils.sparse2full(doc_topics, self.num_topics)
X[k] = probs_docs
return np.reshape(np.array(X), (len(docs), self.num_topics))
if isinstance(docs[0], tuple):
docs = [docs]
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
return np.reshape(np.array(distribution), (len(docs), self.num_topics))

def partial_fit(self, X):
"""
Expand Down
13 changes: 4 additions & 9 deletions gensim/sklearn_api/ldaseqmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,7 @@ def transform(self, docs):
raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")

# The input as array of array
check = lambda x: [x] if isinstance(x[0], tuple) else x
docs = check(docs)
X = [[] for _ in range(0, len(docs))]

for k, v in enumerate(docs):
transformed_author = self.gensim_model[v]
X[k] = transformed_author

return np.reshape(np.array(X), (len(docs), self.num_topics))
if isinstance(docs[0], tuple):
docs = [docs]
proportions = [self.gensim_model[doc] for doc in docs]
return np.reshape(np.array(proportions), (len(docs), self.num_topics))
14 changes: 5 additions & 9 deletions gensim/sklearn_api/lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,11 @@ def transform(self, docs):
)

# The input as array of array
check = lambda x: [x] if isinstance(x[0], tuple) else x
docs = check(docs)
X = [[] for i in range(0, len(docs))]
for k, v in enumerate(docs):
doc_topics = self.gensim_model[v]
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
probs_docs = matutils.sparse2full(doc_topics, self.num_topics)
X[k] = probs_docs
return np.reshape(np.array(X), (len(docs), self.num_topics))
if isinstance(docs[0], tuple):
docs = [docs]
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
return np.reshape(np.array(distribution), (len(docs), self.num_topics))

def partial_fit(self, X):
"""
Expand Down
12 changes: 3 additions & 9 deletions gensim/sklearn_api/phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,9 @@ def transform(self, docs):
raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")

# input as python lists
check = lambda x: [x] if isinstance(x[0], string_types) else x
docs = check(docs)
X = [[] for _ in range(0, len(docs))]

for k, v in enumerate(docs):
phrase_tokens = self.gensim_model[v]
X[k] = phrase_tokens

return X
if isinstance(docs[0], string_types):
docs = [docs]
return [self.gensim_model[doc] for doc in docs]

def partial_fit(self, X):
if self.gensim_model is None:
Expand Down
16 changes: 5 additions & 11 deletions gensim/sklearn_api/rpmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,8 @@ def transform(self, docs):
)

# The input as array of array
check = lambda x: [x] if isinstance(x[0], tuple) else x
docs = check(docs)
X = [[] for _ in range(0, len(docs))]

for k, v in enumerate(docs):
transformed_doc = self.gensim_model[v]
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
probs_docs = matutils.sparse2full(transformed_doc, self.num_topics)
X[k] = probs_docs

return np.reshape(np.array(X), (len(docs), self.num_topics))
if isinstance(docs[0], tuple):
docs = [docs]
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
presentation = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
return np.reshape(np.array(presentation), (len(docs), self.num_topics))
14 changes: 4 additions & 10 deletions gensim/sklearn_api/text2bow.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,10 @@ def transform(self, docs):
)

# input as python lists
check = lambda x: [x] if isinstance(x, string_types) else x
docs = check(docs)
tokenized_docs = [list(self.tokenizer(x)) for x in docs]
X = [[] for _ in range(0, len(tokenized_docs))]

for k, v in enumerate(tokenized_docs):
bow_val = self.gensim_model.doc2bow(v)
X[k] = bow_val

return X
if isinstance(docs, string_types):
docs = [docs]
tokenized_docs = (list(self.tokenizer(doc)) for doc in docs)
return [self.gensim_model.doc2bow(doc) for doc in tokenized_docs]

def partial_fit(self, X):
if self.gensim_model is None:
Expand Down
12 changes: 3 additions & 9 deletions gensim/sklearn_api/tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,6 @@ def transform(self, docs):
)

# input as python lists
check = lambda x: [x] if isinstance(x[0], tuple) else x
docs = check(docs)
X = [[] for _ in range(0, len(docs))]

for k, v in enumerate(docs):
transformed_doc = self.gensim_model[v]
X[k] = transformed_doc

return X
if isinstance(docs[0], tuple):
docs = [docs]
return [self.gensim_model[doc] for doc in docs]
13 changes: 4 additions & 9 deletions gensim/sklearn_api/w2vmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,15 +75,10 @@ def transform(self, words):
)

# The input as array of array
check = lambda x: [x] if isinstance(x, six.string_types) else x
words = check(words)
X = [[] for _ in range(0, len(words))]

for k, v in enumerate(words):
word_vec = self.gensim_model[v]
X[k] = word_vec

return np.reshape(np.array(X), (len(words), self.size))
if isinstance(words, six.string_types):
words = [words]
vectors = [self.gensim_model[word] for word in words]
return np.reshape(np.array(vectors), (len(words), self.size))

def partial_fit(self, X):
raise NotImplementedError(
Expand Down
19 changes: 12 additions & 7 deletions gensim/test/test_atmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@
# Test that models are compatible across versions, as done in LdaModel.

module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)


def datapath(fname):
    """Return the path of `fname` inside the `test_data` directory under `module_path`."""
    return os.path.join(module_path, 'test_data', fname)

# set up vars used in testing ("Deerwester" from the web tutorial)
texts = [
Expand Down Expand Up @@ -475,24 +478,26 @@ def testPasses(self):
# long message includes the original error message with a custom one
self.longMessage = True
# construct what we expect when passes aren't involved
test_rhots = list()
test_rhots = []
model = self.class_(id2word=dictionary, chunksize=1, num_topics=2)
final_rhot = lambda: pow(model.offset + (1 * model.num_updates) / model.chunksize, -model.decay)

def final_rhot(model):
    """Compute the expected rho value from the current state of `model`.

    Evaluates `(offset + num_updates / chunksize) ** -decay` using the
    `offset`, `num_updates`, `chunksize` and `decay` attributes of `model`.
    """
    return pow(model.offset + (1 * model.num_updates) / model.chunksize, -model.decay)

# generate 5 updates to test rhot on
for x in range(5):
for _ in range(5):
model.update(corpus, author2doc)
test_rhots.append(final_rhot())
test_rhots.append(final_rhot(model))

for passes in [1, 5, 10, 50, 100]:
model = self.class_(id2word=dictionary, chunksize=1, num_topics=2, passes=passes)
self.assertEqual(final_rhot(), 1.0)
self.assertEqual(final_rhot(model), 1.0)
# make sure the rhot matches the test after each update
for test_rhot in test_rhots:
model.update(corpus, author2doc)

msg = "{}, {}, {}".format(passes, model.num_updates, model.state.numdocs)
self.assertAlmostEqual(final_rhot(), test_rhot, msg=msg)
self.assertAlmostEqual(final_rhot(model), test_rhot, msg=msg)

self.assertEqual(model.state.numdocs, len(corpus) * len(test_rhots))
self.assertEqual(model.num_updates, len(corpus) * len(test_rhots))
Expand Down
16 changes: 7 additions & 9 deletions gensim/test/test_coherencemodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import unittest
from unittest import SkipTest
import multiprocessing as mp
from functools import partial

import numpy as np
from gensim.corpora.dictionary import Dictionary
Expand Down Expand Up @@ -215,20 +216,17 @@ def testErrors(self):
)

def testProcesses(self):
cpu = mp.cpu_count()
get_model = lambda p: CoherenceModel(
topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass', processes=p,
get_model = partial(CoherenceModel,
topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass'
)

model = CoherenceModel(
topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass',
)
self.assertEqual(model.processes, cpu - 1)
model, used_cpus = get_model(), mp.cpu_count() - 1
self.assertEqual(model.processes, used_cpus)
for p in range(-2, 1):
self.assertEqual(get_model(p).processes, cpu - 1)
self.assertEqual(get_model(processes=p).processes, used_cpus)

for p in range(1, 4):
self.assertEqual(get_model(p).processes, p)
self.assertEqual(get_model(processes=p).processes, p)

def testPersistence(self):
fname = testfile()
Expand Down
5 changes: 4 additions & 1 deletion gensim/test/test_doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@
from gensim.models import doc2vec, keyedvectors

module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)


def datapath(fname):
    """Return the path of `fname` inside the `test_data` directory under `module_path`."""
    return os.path.join(module_path, 'test_data', fname)


class DocsLeeCorpus(object):
Expand Down
5 changes: 4 additions & 1 deletion gensim/test/test_dtm.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@

# needed because sample data files are located in the same folder
module_path = os.path.dirname(__file__)
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)


def datapath(fname):
    """Return the path of `fname` inside the `test_data` directory under `module_path`."""
    return os.path.join(module_path, 'test_data', fname)


class TestDtmModel(unittest.TestCase):
Expand Down
5 changes: 4 additions & 1 deletion gensim/test/test_fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,15 @@
from gensim.models.wrappers.fasttext import FastText as FT_wrapper

module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
logger = logging.getLogger(__name__)

IS_WIN32 = (os.name == "nt") and (struct.calcsize('P') * 8 == 32)


def datapath(fname):
    """Return the path of `fname` inside the `test_data` directory under `module_path`."""
    return os.path.join(module_path, 'test_data', fname)


class LeeCorpus(object):
def __iter__(self):
with open(datapath('lee_background.cor')) as f:
Expand Down
Loading

0 comments on commit 04d7f87

Please sign in to comment.