
Commit 944d621
Merge pull request #1369 from menshikh-iv/pep8_and_minor_changes
Minor changes in fresh PRs (PEP 8 fixes, typos, etc.)
menshikh-iv authored May 29, 2017
2 parents f1df69a + 025f63a commit 944d621
Showing 16 changed files with 160 additions and 165 deletions.
63 changes: 28 additions & 35 deletions docs/notebooks/sklearn_wrapper.ipynb
@@ -65,15 +65,17 @@
 "outputs": [],
 "source": [
 "from gensim.corpora import Dictionary\n",
-"texts = [['complier', 'system', 'computer'],\n",
-" ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],\n",
-" ['graph', 'flow', 'network', 'graph'],\n",
-" ['loading', 'computer', 'system'],\n",
-" ['user', 'server', 'system'],\n",
-" ['tree','hamiltonian'],\n",
-" ['graph', 'trees'],\n",
-" ['computer', 'kernel', 'malfunction','computer'],\n",
-" ['server','system','computer']]\n",
+"texts = [\n",
+" ['complier', 'system', 'computer'],\n",
+" ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],\n",
+" ['graph', 'flow', 'network', 'graph'],\n",
+" ['loading', 'computer', 'system'],\n",
+" ['user', 'server', 'system'],\n",
+" ['tree', 'hamiltonian'],\n",
+" ['graph', 'trees'],\n",
+" ['computer', 'kernel', 'malfunction', 'computer'],\n",
+" ['server', 'system', 'computer']\n",
+"]\n",
 "dictionary = Dictionary(texts)\n",
 "corpus = [dictionary.doc2bow(text) for text in texts]"
 ]
@@ -119,7 +121,7 @@
 }
 ],
 "source": [
-"model=SklearnWrapperLdaModel(num_topics=2,id2word=dictionary,iterations=20, random_state=1)\n",
+"model=SklearnWrapperLdaModel(num_topics=2, id2word=dictionary, iterations=20, random_state=1)\n",
 "model.fit(corpus)\n",
 "model.print_topics(2)\n",
 "model.transform(corpus)"
@@ -167,9 +169,7 @@
 "source": [
 "rand = np.random.mtrand.RandomState(1) # set seed for getting same result\n",
 "cats = ['rec.sport.baseball', 'sci.crypt']\n",
-"data = fetch_20newsgroups(subset='train',\n",
-" categories=cats,\n",
-" shuffle=True)"
+"data = fetch_20newsgroups(subset='train', categories=cats, shuffle=True)"
 ]
 },
 {
@@ -190,9 +190,9 @@
 "vec = CountVectorizer(min_df=10, stop_words='english')\n",
 "\n",
 "X = vec.fit_transform(data.data)\n",
-"vocab = vec.get_feature_names() #vocab to be converted to id2word \n",
+"vocab = vec.get_feature_names() # vocab to be converted to id2word \n",
 "\n",
-"id2word=dict([(i, s) for i, s in enumerate(vocab)])"
+"id2word = dict([(i, s) for i, s in enumerate(vocab)])"
 ]
 },
 {
@@ -230,8 +230,8 @@
 }
 ],
 "source": [
-"obj=SklearnWrapperLdaModel(id2word=id2word,num_topics=5,passes=20)\n",
-"lda=obj.fit(X)\n",
+"obj = SklearnWrapperLdaModel(id2word=id2word, num_topics=5, passes=20)\n",
+"lda = obj.fit(X)\n",
 "lda.print_topics()"
 ]
 },
@@ -264,7 +264,7 @@
 },
 "outputs": [],
 "source": [
-"def scorer(estimator, X,y=None):\n",
+"def scorer(estimator, X, y=None):\n",
 " goodcm = CoherenceModel(model=estimator, texts= texts, dictionary=estimator.id2word, coherence='c_v')\n",
 " return goodcm.get_coherence()"
 ]
@@ -297,8 +297,8 @@
 }
 ],
 "source": [
-"obj=SklearnWrapperLdaModel(id2word=dictionary,num_topics=5,passes=20)\n",
-"parameters = {'num_topics':(2, 3, 5, 10), 'iterations':(1,20,50)}\n",
+"obj = SklearnWrapperLdaModel(id2word=dictionary, num_topics=5, passes=20)\n",
+"parameters = {'num_topics': (2, 3, 5, 10), 'iterations': (1, 20, 50)}\n",
 "model = GridSearchCV(obj, parameters, scoring=scorer, cv=5)\n",
 "model.fit(corpus)"
 ]
@@ -342,12 +342,14 @@
 "source": [
 "from sklearn.pipeline import Pipeline\n",
 "from sklearn import linear_model\n",
+"\n",
+"\n",
 "def print_features_pipe(clf, vocab, n=10):\n",
 " ''' Better printing for sorted list '''\n",
 " coef = clf.named_steps['classifier'].coef_[0]\n",
 " print coef\n",
 " print 'Positive features: %s' % (' '.join(['%s:%.2f' % (vocab[j], coef[j]) for j in np.argsort(coef)[::-1][:n] if coef[j] > 0]))\n",
-" print 'Negative features: %s' % (' '.join(['%s:%.2f' % (vocab[j], coef[j]) for j in np.argsort(coef)[:n] if coef[j] < 0]))\n"
+" print 'Negative features: %s' % (' '.join(['%s:%.2f' % (vocab[j], coef[j]) for j in np.argsort(coef)[:n] if coef[j] < 0]))"
 ]
 },
 {
@@ -358,7 +360,7 @@
 },
 "outputs": [],
 "source": [
-"id2word=Dictionary(map(lambda x : x.split(),data.data))\n",
+"id2word = Dictionary([_.split() for _ in data.data])\n",
 "corpus = [id2word.doc2bow(i.split()) for i in data.data]"
 ]
 },
@@ -391,8 +393,8 @@
 }
 ],
 "source": [
-"model=SklearnWrapperLdaModel(num_topics=15,id2word=id2word,iterations=50, random_state=37)\n",
-"clf=linear_model.LogisticRegression(penalty='l2', C=0.1) #l2 penalty used\n",
+"model = SklearnWrapperLdaModel(num_topics=15, id2word=id2word, iterations=50, random_state=37)\n",
+"clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n",
 "pipe = Pipeline((('features', model,), ('classifier', clf)))\n",
 "pipe.fit(corpus, data.target)\n",
 "print_features_pipe(pipe, id2word.values())\n",
@@ -452,22 +454,13 @@
 }
 ],
 "source": [
-"model=SklearnWrapperLsiModel(num_topics=15, id2word=id2word)\n",
-"clf=linear_model.LogisticRegression(penalty='l2', C=0.1) #l2 penalty used\n",
+"model = SklearnWrapperLsiModel(num_topics=15, id2word=id2word)\n",
+"clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n",
 "pipe = Pipeline((('features', model,), ('classifier', clf)))\n",
 "pipe.fit(corpus, data.target)\n",
 "print_features_pipe(pipe, id2word.values())\n",
 "print pipe.score(corpus, data.target)"
 ]
 },
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {
-"collapsed": true
-},
-"outputs": [],
-"source": []
-}
 ],
 "metadata": {
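For reference, the Dictionary/doc2bow step that all of the notebook cells above build on works as sketched below (a minimal runnable example, not part of the commit; the token lists are illustrative):

from gensim.corpora import Dictionary

texts = [['graph', 'flow', 'network', 'graph'], ['computer', 'system']]
dictionary = Dictionary(texts)                   # assigns each token an integer id
corpus = [dictionary.doc2bow(t) for t in texts]  # sparse (token_id, count) pairs per document
print(corpus[0])                                 # e.g. [(0, 1), (1, 2), (2, 1)] -- 'graph' counted twice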
7 changes: 3 additions & 4 deletions docs/notebooks/word2vec.ipynb
@@ -785,7 +785,7 @@
 }
 ],
 "source": [
-"model.evaluate_word_pairs(test_data_dir +'wordsim353.tsv')"
+"model.evaluate_word_pairs(test_data_dir + 'wordsim353.tsv')"
 ]
 },
 {
@@ -907,8 +907,7 @@
 ],
 "source": [
 "model = gensim.models.Word2Vec.load(temp_path)\n",
-"more_sentences = [['Advanced', 'users', 'can', 'load', 'a', 'model', 'and', 'continue', \n",
-" 'training', 'it', 'with', 'more', 'sentences']]\n",
+"more_sentences = [['Advanced', 'users', 'can', 'load', 'a', 'model', 'and', 'continue', 'training', 'it', 'with', 'more', 'sentences']]\n",
 "model.build_vocab(more_sentences, update=True)\n",
 "model.train(more_sentences, total_examples=model.corpus_count, epochs=model.iter)\n",
 "\n",
@@ -1023,7 +1022,7 @@
 }
 ],
 "source": [
-"print(model.predict_output_word(['emergency','beacon','received']))"
+"print(model.predict_output_word(['emergency', 'beacon', 'received']))"
 ]
 },
 {
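The online-training cell reformatted above follows this general pattern (a sketch assuming the gensim 2.x API of this commit, where the epoch count lives in model.iter; later releases renamed it to model.epochs):

import gensim

sentences = [['human', 'interface', 'computer'], ['graph', 'trees']] * 10
model = gensim.models.Word2Vec(sentences, min_count=1)  # initial training pass

more_sentences = [['graph', 'minors', 'survey']] * 10
model.build_vocab(more_sentences, update=True)  # extend the existing vocabulary in place
model.train(more_sentences, total_examples=model.corpus_count, epochs=model.iter)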
8 changes: 5 additions & 3 deletions gensim/corpora/dictionary.py
@@ -194,9 +194,11 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N
         # determine which tokens to keep
         if keep_tokens:
             keep_ids = [self.token2id[v] for v in keep_tokens if v in self.token2id]
-            good_ids = (v for v in itervalues(self.token2id)
-                        if no_below <= self.dfs.get(v, 0) <= no_above_abs
-                        or v in keep_ids)
+            good_ids = (
+                v for v in itervalues(self.token2id)
+                if no_below <= self.dfs.get(v, 0) <= no_above_abs
+                or v in keep_ids
+            )
         else:
             good_ids = (
                 v for v in itervalues(self.token2id)
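The keep_tokens branch being reformatted here guarantees that whitelisted tokens survive the frequency filter; a quick sketch of the behaviour (not part of the commit, toy data):

from gensim.corpora import Dictionary

docs = [['cat', 'dog'], ['cat', 'fish'], ['cat', 'graph']]
d = Dictionary(docs)
# 'graph' appears in only one document, so no_below=2 would normally drop it
d.filter_extremes(no_below=2, no_above=1.0, keep_tokens=['graph'])
print(sorted(d.token2id))  # expected: ['cat', 'graph']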
13 changes: 11 additions & 2 deletions gensim/matutils.py
@@ -532,8 +532,17 @@ def jaccard(vec1, vec2):
     return 1 - float(len(intersection)) / float(len(union))
 
 
-def jaccard_set(set1, set2):
-    return 1. - float(len(set1 & set2)) / float(len(set1 | set2))
+def jaccard_distance(set1, set2):
+    """
+    Calculate a distance between set representation (1 minus the intersection divided by union).
+    Return a value in range <0, 1> where values closer to 0 mean smaller distance and thus higher similarity.
+    """
+
+    union_cardinality = len(set1 | set2)
+    if union_cardinality == 0:  # Both sets are empty
+        return 1.
+
+    return 1. - float(len(set1 & set2)) / float(union_cardinality)
 
 
 def dirichlet_expectation(alpha):
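Besides the rename, the new jaccard_distance guards the empty-set case, which the old one-liner turned into a ZeroDivisionError; a sketch of both behaviours (assumes this commit is applied):

from gensim.matutils import jaccard_distance

print(jaccard_distance({'graph', 'tree'}, {'graph', 'path'}))  # 1 - 1/3 ≈ 0.67
print(jaccard_distance(set(), set()))                          # 1.0 -- empty sets, no crash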
10 changes: 5 additions & 5 deletions gensim/models/hdpmodel.py
@@ -33,7 +33,9 @@
 
 from __future__ import with_statement
 
-import logging, time
+import logging
+import time
+import warnings
 import numpy as np
 from scipy.special import gammaln, psi  # gamma function utils

@@ -614,16 +616,14 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
 
     def print_topic(self, topic_id, topn= None, num_words=None):
         if num_words is not None:  # deprecated num_words is used
-            logger.warning("The parameter num_words for print_topic() would be deprecated in the updated version.")
-            logger.warning("Please use topn instead.")
+            warnings.warn("The parameter num_words for print_topic() would be deprecated in the updated version. Please use topn instead.")
             topn = num_words
 
         return self.show_topic(topic_id, topn, formatted=True)
 
     def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words= None,):
         if num_words is not None:  # deprecated num_words is used
-            logger.warning("The parameter num_words for show_topic() would be deprecated in the updated version.")
-            logger.warning("Please use topn instead.")
+            warnings.warn("The parameter num_words for show_topic() would be deprecated in the updated version. Please use topn instead.")
             topn = num_words
 
         lambdak = list(self.data[topic_id, :])
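Switching from logger.warning to warnings.warn lets callers filter or capture the deprecation notice through the standard warnings machinery; a self-contained sketch of the pattern (a simplified stand-in, not the real method):

import warnings

def print_topic(topic_id, topn=None, num_words=None):
    if num_words is not None:  # deprecated num_words is used
        warnings.warn("The parameter num_words would be deprecated in the updated version. Please use topn instead.")
        topn = num_words
    return topic_id, topn

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    print_topic(0, num_words=5)
    print(caught[0].message)  # the deprecation text is now programmatically visible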
16 changes: 9 additions & 7 deletions gensim/models/ldamodel.py
@@ -39,7 +39,7 @@
 from gensim import interfaces, utils, matutils
 from gensim.matutils import dirichlet_expectation
 from gensim.models import basemodel
-from gensim.matutils import kullback_leibler, hellinger, jaccard_set
+from gensim.matutils import kullback_leibler, hellinger, jaccard_distance
 
 from itertools import chain
 from scipy.special import gammaln, psi  # gamma function utils
@@ -989,9 +989,11 @@ def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10,
         >>> print(annotation)  # get array with positive/negative words for each topic pair from `m1` and `m2`
         """
 
-        distances = {"kulback_leibler": kullback_leibler,
-                     "hellinger": hellinger,
-                     "jaccard": jaccard_set}
+        distances = {
+            "kulback_leibler": kullback_leibler,
+            "hellinger": hellinger,
+            "jaccard": jaccard_distance,
+        }
 
         if distance not in distances:
             valid_keys = ", ".join("`{}`".format(x) for x in distances.keys())
@@ -1019,7 +1021,7 @@ def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10,
         if np.abs(np.max(z)) > 1e-8:
             z /= np.max(z)
 
-        annotation = [[None for _ in range(t1_size)] for _ in range(t2_size)]
+        annotation = [[None] * t1_size for _ in range(t2_size)]
 
         for topic1 in range(t1_size):
             for topic2 in range(t2_size):
@@ -1118,7 +1120,7 @@ def load(cls, fname, *args, **kwargs):
         kwargs['mmap'] = kwargs.get('mmap', None)
         result = super(LdaModel, cls).load(fname, *args, **kwargs)
 
-        # check if `random_state` attribute has been set after main pickel load
+        # check if `random_state` attribute has been set after main pickle load
         # if set -> the model to be loaded was saved using a >= 0.13.2 version of Gensim
         # if not set -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so set `random_state` as the default value
         if not hasattr(result, 'random_state'):
@@ -1134,7 +1136,7 @@ def load(cls, fname, *args, **kwargs):
         id2word_fname = utils.smart_extension(fname, '.id2word')
         # check if `id2word_fname` file is present on disk
         # if present -> the model to be loaded was saved using a >= 0.13.2 version of Gensim, so set `result.id2word` using the `id2word_fname` file
-        # if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so `result.id2word` already set after the main pickel load
+        # if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so `result.id2word` already set after the main pickle load
         if (os.path.isfile(id2word_fname)):
             try:
                 result.id2word = utils.unpickle(id2word_fname)
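With the import updated, LdaModel.diff accepts "jaccard" and dispatches to the renamed helper; a toy sketch (assumes gensim with this commit applied; the corpus is tiny, so only the output shape is meaningful):

import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [['graph', 'tree', 'path'], ['computer', 'system', 'server']] * 4
d = Dictionary(texts)
bow = [d.doc2bow(t) for t in texts]
m1 = LdaModel(bow, id2word=d, num_topics=2, random_state=1)
m2 = LdaModel(bow, id2word=d, num_topics=2, random_state=2)

mdiff, annotation = m1.diff(m2, distance="jaccard", num_words=5)
print(np.asarray(mdiff).shape)  # (2, 2): one distance per topic pair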
(The diffs for the remaining 10 changed files are not shown.)
