diff --git a/gensim/summarization/__init__.py b/gensim/summarization/__init__.py index 57c9a7c815..c7efb84d4a 100644 --- a/gensim/summarization/__init__.py +++ b/gensim/summarization/__init__.py @@ -1,4 +1,4 @@ # bring model classes directly into package namespace, to save some typing from .summarizer import summarize, summarize_corpus -from .keywords import keywords \ No newline at end of file +from .keywords import keywords diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index 6704146d54..b32d2f040d 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -40,7 +40,7 @@ def initialize(self): self.df[word] += 1 for word, freq in iteritems(self.df): - self.idf[word] = math.log(self.corpus_size-freq+0.5) - math.log(freq+0.5) + self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5) def get_score(self, document, index, average_idf): score = 0 @@ -48,8 +48,9 @@ def get_score(self, document, index, average_idf): if word not in self.f[index]: continue idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf - score += (idf*self.f[index][word]*(PARAM_K1+1) - / (self.f[index][word] + PARAM_K1*(1 - PARAM_B+PARAM_B*self.corpus_size / self.avgdl))) + score += (idf * self.f[index][word] * (PARAM_K1 + 1) + / (self.f[index][word] + PARAM_K1 + * (1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl))) return score def get_scores(self, document, average_idf): diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index 3bb7cee100..8ccc8e0554 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -161,10 +161,10 @@ def _get_combined_keywords(_keywords, split_text): if word in _keywords: combined_word = [word] if i + 1 == len_text: - result.append(word) # appends last word if keyword and doesn't iterate + result.append(word) # appends last word if keyword and doesn't iterate for j in xrange(i + 1, len_text): other_word = 
_strip_word(split_text[j]) - if other_word in _keywords and other_word == split_text[j] and not other_word in combined_word: + if other_word in _keywords and other_word == split_text[j] and other_word not in combined_word: combined_word.append(other_word) else: for keyword in combined_word: @@ -210,7 +210,8 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= _remove_unreachable_nodes(graph) - # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score + # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> + # score pagerank_scores = _pagerank(graph) extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words) @@ -225,7 +226,8 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word) - # text.split() to keep numbers and punctuation marks, so separeted concepts are not combined + # text.split() to keep numbers and punctuation marks, so separeted + # concepts are not combined combined_keywords = _get_combined_keywords(keywords, text.split()) return _format_results(keywords, combined_keywords, split, scores) diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py index 1978c6e1c7..be1a5dfef5 100644 --- a/gensim/summarization/pagerank_weighted.py +++ b/gensim/summarization/pagerank_weighted.py @@ -21,7 +21,8 @@ def pagerank_weighted(graph, damping=0.85): pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix - vals, vecs = eigs(pagerank_matrix.T, k=1) # TODO raise an error if matrix has complex eigenvectors? + # TODO raise an error if matrix has complex eigenvectors? 
+ vals, vecs = eigs(pagerank_matrix.T, k=1) return process_results(graph, vecs.real) @@ -35,7 +36,7 @@ def build_adjacency_matrix(graph): for i in xrange(length): current_node = nodes[i] - neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node)) + neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node)) for j in xrange(length): edge_weight = float(graph.edge_weight((current_node, nodes[j]))) if i != j and edge_weight != 0.0: diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 0779011999..f6c83e319d 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -158,7 +158,8 @@ def summarize_corpus(corpus, ratio=0.2): _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) - # Cannot calculate eigenvectors if number of unique words in text < 3. Warns user to add more text. The function ends. + # Cannot calculate eigenvectors if number of unique words in text < 3. + # Warns user to add more text. The function ends. if len(graph.nodes()) < 3: logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") return @@ -198,10 +199,11 @@ def summarize(text, ratio=0.2, word_count=None, split=False): logger.warning("Input text is empty.") return - # If only one sentence is present, the function raises an error (Avoids ZeroDivisionError). + # If only one sentence is present, the function raises an error (Avoids + # ZeroDivisionError). if len(sentences) == 1: raise ValueError("input must have more than one sentence") - + # Warns if the text is too short. if len(sentences) < INPUT_MIN_LENGTH: logger.warning("Input text is expected to have at least " + str(INPUT_MIN_LENGTH) + " sentences.")