PEP8 Fixes for Summarization. #1017

Closed. Wants to merge 10 commits.
Changes from 4 commits
2 changes: 1 addition & 1 deletion gensim/summarization/__init__.py
@@ -1,4 +1,4 @@

# bring model classes directly into package namespace, to save some typing
from .summarizer import summarize, summarize_corpus
from .keywords import keywords
from .keywords import keywords
6 changes: 3 additions & 3 deletions gensim/summarization/bm25.py
@@ -40,16 +40,16 @@ def initialize(self):
self.df[word] += 1

for word, freq in iteritems(self.df):
self.idf[word] = math.log(self.corpus_size-freq+0.5) - math.log(freq+0.5)
self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

def get_score(self, document, index, average_idf):
score = 0
for word in document:
if word not in self.f[index]:
continue
idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
score += (idf*self.f[index][word]*(PARAM_K1+1)
/ (self.f[index][word] + PARAM_K1*(1 - PARAM_B+PARAM_B*self.corpus_size / self.avgdl)))
score += (idf * self.f[index][word] * (PARAM_K1 + 1) /
Review comment (Owner): No vertical indent.
Splitting the mega-expression into something saner (subexpressions) will help both readability and line length.

(self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl)))
return score

def get_scores(self, document, average_idf):
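As an illustration of the reviewer's suggestion (a sketch only; the names "freq", "numerator", and "denominator" are made up here, not part of the PR), the mega-expression could be split into named subexpressions:

    def get_score(self, document, index, average_idf):
        score = 0
        for word in document:
            if word not in self.f[index]:
                continue
            idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
            freq = self.f[index][word]
            numerator = idf * freq * (PARAM_K1 + 1)
            denominator = freq + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl)
            score += numerator / denominator
        return score

Each line then fits the length limit without continuation, and the BM25 numerator and denominator are readable at a glance.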
3 changes: 2 additions & 1 deletion gensim/summarization/commons.py
@@ -16,5 +16,6 @@ def build_graph(sequence):

def remove_unreachable_nodes(graph):
for node in graph.nodes():
if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0:
if sum(graph.edge_weight((node, other))
Review comment (Contributor): We don't check for length of line here.
for other in graph.neighbors(node)) == 0:
graph.del_node(node)
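If line length were actually the concern here, naming the subexpression would read better than the vertical split (a sketch; "weight_sum" is an illustrative name, not from the PR):

    def remove_unreachable_nodes(graph):
        for node in graph.nodes():
            weight_sum = sum(graph.edge_weight((node, other)) for other in graph.neighbors(node))
            if weight_sum == 0:
                graph.del_node(node)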
20 changes: 16 additions & 4 deletions gensim/summarization/keywords.py
@@ -161,7 +161,8 @@ def _get_combined_keywords(_keywords, split_text):
if word in _keywords:
combined_word = [word]
if i + 1 == len_text:
result.append(word) # appends last word if keyword and doesn't iterate
# appends last word if keyword and doesn't iterate
Review comment (Contributor): Ditto.

result.append(word)
for j in xrange(i + 1, len_text):
other_word = _strip_word(split_text[j])
if other_word in _keywords and other_word == split_text[j] and not other_word in combined_word:
@@ -197,7 +198,16 @@ def _format_results(_keywords, combined_keywords, split, scores):
return "\n".join(combined_keywords)


def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False):
def keywords(
text,
ratio=0.2,
words=None,
split=False,
scores=False,
pos_filter=[
'NN',
'JJ'],
lemmatize=False):
# Gets a dict of word -> lemma
text = to_unicode(text)
tokens = _clean_text_by_word(text)
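The fully vertical expansion above is the autopep8 style the reviewers reject later in this PR; a conventional wrap that stays within the line limit might look like this (a sketch, not a change the PR makes; the mutable default pos_filter=['NN', 'JJ'] is kept as in the original):

    def keywords(text, ratio=0.2, words=None, split=False, scores=False,
                 pos_filter=['NN', 'JJ'], lemmatize=False):
        ...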
@@ -210,7 +220,8 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=

_remove_unreachable_nodes(graph)

# Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
# Ranks the tokens using the PageRank algorithm. Returns dict of lemma ->
# score
pagerank_scores = _pagerank(graph)

extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)
@@ -225,7 +236,8 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=

keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

# text.split() to keep numbers and punctuation marks, so separeted concepts are not combined
# text.split() to keep numbers and punctuation marks, so separeted
# concepts are not combined
combined_keywords = _get_combined_keywords(keywords, text.split())

return _format_results(keywords, combined_keywords, split, scores)
5 changes: 3 additions & 2 deletions gensim/summarization/pagerank_weighted.py
@@ -21,7 +21,8 @@ def pagerank_weighted(graph, damping=0.85):

pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix

vals, vecs = eigs(pagerank_matrix.T, k=1) # TODO raise an error if matrix has complex eigenvectors?
# TODO raise an error if matrix has complex eigenvectors?
vals, vecs = eigs(pagerank_matrix.T, k=1)

return process_results(graph, vecs.real)
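The TODO could be addressed along these lines (a sketch assuming numpy is imported as np; not part of the PR):

    vals, vecs = eigs(pagerank_matrix.T, k=1)
    if not np.allclose(vecs.imag, 0):
        raise ValueError("pagerank matrix has complex eigenvectors")
    return process_results(graph, vecs.real)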

@@ -35,7 +36,7 @@ def build_adjacency_matrix(graph):

for i in xrange(length):
current_node = nodes[i]
neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node))
neighbors_sum = sum(graph.edge_weight((current_node, neighbor))for neighbor in graph.neighbors(current_node))
for j in xrange(length):
edge_weight = float(graph.edge_weight((current_node, nodes[j])))
if i != j and edge_weight != 0.0:
17 changes: 12 additions & 5 deletions gensim/summarization/summarizer.py
@@ -110,12 +110,17 @@ def _get_sentences_with_word_count(sentences, word_count):
return selected_sentences


def _extract_important_sentences(sentences, corpus, important_docs, word_count):
def _extract_important_sentences(
Review comment (Owner): Definitely not.

sentences,
corpus,
important_docs,
word_count):
important_sentences = _get_important_sentences(sentences, corpus, important_docs)

# If no "word_count" option is provided, the number of sentences is
# reduced by the provided ratio. Else, the ratio is ignored.
return important_sentences if word_count is None else _get_sentences_with_word_count(important_sentences, word_count)
return important_sentences if word_count is None else _get_sentences_with_word_count(
Review comment (Owner): No.

important_sentences, word_count)


def _format_results(extracted_sentences, split):
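Given the two review comments above, the version the reviewer appears to prefer keeps the one-line signature and splits the long conditional instead (a sketch, not part of the PR):

    def _extract_important_sentences(sentences, corpus, important_docs, word_count):
        important_sentences = _get_important_sentences(sentences, corpus, important_docs)
        # If no "word_count" option is provided, the number of sentences is
        # reduced by the provided ratio. Else, the ratio is ignored.
        if word_count is None:
            return important_sentences
        return _get_sentences_with_word_count(important_sentences, word_count)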
@@ -158,7 +163,8 @@ def summarize_corpus(corpus, ratio=0.2):
_set_graph_edge_weights(graph)
_remove_unreachable_nodes(graph)

# Cannot calculate eigenvectors if number of unique words in text < 3. Warns user to add more text. The function ends.
# Cannot calculate eigenvectors if number of unique words in text < 3.
# Warns user to add more text. The function ends.
if len(graph.nodes()) < 3:
logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3")
return
@@ -198,10 +204,11 @@ def summarize(text, ratio=0.2, word_count=None, split=False):
logger.warning("Input text is empty.")
return

# If only one sentence is present, the function raises an error (Avoids ZeroDivisionError).
# If only one sentence is present, the function raises an error (Avoids
# ZeroDivisionError).
if len(sentences) == 1:
raise ValueError("input must have more than one sentence")

# Warns if the text is too short.
if len(sentences) < INPUT_MIN_LENGTH:
logger.warning("Input text is expected to have at least " + str(INPUT_MIN_LENGTH) + " sentences.")