From f051cbf721fbed682d864a4536565ba506f35261 Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Sun, 13 Nov 2016 08:37:16 -0800 Subject: [PATCH 1/9] PEP8 Fixes for Summarization. --- gensim/summarization/__init__.py | 2 +- gensim/summarization/bm25.py | 25 +++++++--- gensim/summarization/commons.py | 3 +- gensim/summarization/graph.py | 14 ++++-- gensim/summarization/keywords.py | 39 ++++++++++++---- gensim/summarization/pagerank_weighted.py | 9 ++-- gensim/summarization/summarizer.py | 56 +++++++++++++++++------ gensim/summarization/syntactic_unit.py | 3 +- gensim/summarization/textcleaner.py | 40 +++++++++++----- 9 files changed, 139 insertions(+), 52 deletions(-) diff --git a/gensim/summarization/__init__.py b/gensim/summarization/__init__.py index 57c9a7c815..c7efb84d4a 100644 --- a/gensim/summarization/__init__.py +++ b/gensim/summarization/__init__.py @@ -1,4 +1,4 @@ # bring model classes directly into package namespace, to save some typing from .summarizer import summarize, summarize_corpus -from .keywords import keywords \ No newline at end of file +from .keywords import keywords diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index 6704146d54..f9aecbb43b 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -18,7 +18,8 @@ class BM25(object): def __init__(self, corpus): self.corpus_size = len(corpus) - self.avgdl = sum(map(lambda x: float(len(x)), corpus)) / self.corpus_size + self.avgdl = sum(map(lambda x: float(len(x)), corpus) + ) / self.corpus_size self.corpus = corpus self.f = [] self.df = {} @@ -40,16 +41,27 @@ def initialize(self): self.df[word] += 1 for word, freq in iteritems(self.df): - self.idf[word] = math.log(self.corpus_size-freq+0.5) - math.log(freq+0.5) + self.idf[word] = math.log( + self.corpus_size - freq + 0.5) - math.log(freq + 0.5) def get_score(self, document, index, average_idf): score = 0 for word in document: if word not in self.f[index]: continue - idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf - score += (idf*self.f[index][word]*(PARAM_K1+1) - / (self.f[index][word] + PARAM_K1*(1 - PARAM_B+PARAM_B*self.corpus_size / self.avgdl))) + idf = self.idf[word] if self.idf[ + word] >= 0 else EPSILON * average_idf + score += (idf * + self.f[index][word] * + (PARAM_K1 + + 1) / + (self.f[index][word] + + PARAM_K1 * + (1 - + PARAM_B + + PARAM_B * + self.corpus_size / + self.avgdl))) return score def get_scores(self, document, average_idf): @@ -62,7 +74,8 @@ def get_scores(self, document, average_idf): def get_bm25_weights(corpus): bm25 = BM25(corpus) - average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys()) + average_idf = sum(map(lambda k: float( + bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys()) weights = [] for doc in corpus: diff --git a/gensim/summarization/commons.py b/gensim/summarization/commons.py index 1c467098f9..4f19196066 100644 --- a/gensim/summarization/commons.py +++ b/gensim/summarization/commons.py @@ -16,5 +16,6 @@ def build_graph(sequence): def remove_unreachable_nodes(graph): for node in graph.nodes(): - if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0: + if sum(graph.edge_weight((node, other)) + for other in graph.neighbors(node)) == 0: graph.del_node(node) diff --git a/gensim/summarization/graph.py b/gensim/summarization/graph.py index bfed410b5e..7922ac7c9c 100644 --- a/gensim/summarization/graph.py +++ b/gensim/summarization/graph.py @@ -161,10 +161,13 @@ def __init__(self): def has_edge(self, 
edge): u, v = edge - return (u, v) in self.edge_properties and (v, u) in self.edge_properties + return ( + u, v) in self.edge_properties and ( + v, u) in self.edge_properties def edge_weight(self, edge): - return self.get_edge_properties(edge).setdefault(self.WEIGHT_ATTRIBUTE_NAME, self.DEFAULT_WEIGHT) + return self.get_edge_properties(edge).setdefault( + self.WEIGHT_ATTRIBUTE_NAME, self.DEFAULT_WEIGHT) def neighbors(self, node): return self.node_neighbors[node] @@ -218,7 +221,9 @@ def add_edge_attribute(self, edge, attr): self.edge_attr[edge] = self.edge_attributes(edge) + [attr] if edge[0] != edge[1]: - self.edge_attr[(edge[1], edge[0])] = self.edge_attributes((edge[1], edge[0])) + [attr] + self.edge_attr[ + (edge[1], edge[0])] = self.edge_attributes( + (edge[1], edge[0])) + [attr] def edge_attributes(self, edge): try: @@ -229,7 +234,8 @@ def edge_attributes(self, edge): def set_edge_properties(self, edge, **properties): self.edge_properties.setdefault(edge, {}).update(properties) if edge[0] != edge[1]: - self.edge_properties.setdefault((edge[1], edge[0]), {}).update(properties) + self.edge_properties.setdefault( + (edge[1], edge[0]), {}).update(properties) def del_edge(self, edge): u, v = edge diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index 3bb7cee100..7e9a919909 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -37,7 +37,8 @@ def _get_words_for_graph(tokens, pos_filter): include_filters = set(pos_filter) exclude_filters = frozenset([]) if include_filters and exclude_filters: - raise ValueError("Can't use both include and exclude filters, should use only one") + raise ValueError( + "Can't use both include and exclude filters, should use only one") result = [] for word, unit in iteritems(tokens): @@ -58,7 +59,8 @@ def _set_graph_edge(graph, tokens, word_a, word_b): lemma_b = tokens[word_b].token edge = (lemma_a, lemma_b) - if graph.has_node(lemma_a) and graph.has_node(lemma_b) and not graph.has_edge(edge): + if graph.has_node(lemma_a) and graph.has_node( + lemma_b) and not graph.has_edge(edge): graph.add_edge(edge) @@ -161,10 +163,12 @@ def _get_combined_keywords(_keywords, split_text): if word in _keywords: combined_word = [word] if i + 1 == len_text: - result.append(word) # appends last word if keyword and doesn't iterate + # appends last word if keyword and doesn't iterate + result.append(word) for j in xrange(i + 1, len_text): other_word = _strip_word(split_text[j]) - if other_word in _keywords and other_word == split_text[j] and not other_word in combined_word: + if other_word in _keywords and other_word == split_text[ + j] and not other_word in combined_word: combined_word.append(other_word) else: for keyword in combined_word: @@ -189,15 +193,27 @@ def _format_results(_keywords, combined_keywords, split, scores): :param keywords:dict of keywords:scores :param combined_keywords:list of word/s """ - combined_keywords.sort(key=lambda w: _get_average_score(w, _keywords), reverse=True) + combined_keywords.sort( + key=lambda w: _get_average_score( + w, _keywords), reverse=True) if scores: - return [(word, _get_average_score(word, _keywords)) for word in combined_keywords] + return [(word, _get_average_score(word, _keywords)) + for word in combined_keywords] if split: return combined_keywords return "\n".join(combined_keywords) -def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False): +def keywords( + text, + ratio=0.2, + words=None, + split=False, + 
scores=False, + pos_filter=[ + 'NN', + 'JJ'], + lemmatize=False): # Gets a dict of word -> lemma text = to_unicode(text) tokens = _clean_text_by_word(text) @@ -210,10 +226,12 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= _remove_unreachable_nodes(graph) - # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score + # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> + # score pagerank_scores = _pagerank(graph) - extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words) + extracted_lemmas = _extract_tokens( + graph.nodes(), pagerank_scores, ratio, words) # The results can be polluted by many variations of the same word if lemmatize: @@ -225,7 +243,8 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word) - # text.split() to keep numbers and punctuation marks, so separeted concepts are not combined + # text.split() to keep numbers and punctuation marks, so separeted + # concepts are not combined combined_keywords = _get_combined_keywords(keywords, text.split()) return _format_results(keywords, combined_keywords, split, scores) diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py index 1978c6e1c7..f2e97049e4 100644 --- a/gensim/summarization/pagerank_weighted.py +++ b/gensim/summarization/pagerank_weighted.py @@ -19,9 +19,11 @@ def pagerank_weighted(graph, damping=0.85): adjacency_matrix = build_adjacency_matrix(graph) probability_matrix = build_probability_matrix(graph) - pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix + pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * \ + probability_matrix - vals, vecs = eigs(pagerank_matrix.T, k=1) # TODO raise an error if matrix has complex eigenvectors? + # TODO raise an error if matrix has complex eigenvectors? 
+ vals, vecs = eigs(pagerank_matrix.T, k=1) return process_results(graph, vecs.real) @@ -35,7 +37,8 @@ def build_adjacency_matrix(graph): for i in xrange(length): current_node = nodes[i] - neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node)) + neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) + for neighbor in graph.neighbors(current_node)) for j in xrange(length): edge_weight = float(graph.edge_weight((current_node, nodes[j]))) if i != j and edge_weight != 0.0: diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 0779011999..1c8fc4f219 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -72,7 +72,8 @@ def _get_similarity(doc1, doc2, vec1, vec2): length_1 = _get_doc_length(doc1) length_2 = _get_doc_length(doc2) - denominator = _log10(length_1) + _log10(length_2) if length_1 > 0 and length_2 > 0 else 0 + denominator = _log10( + length_1) + _log10(length_2) if length_1 > 0 and length_2 > 0 else 0 return numerator / denominator if denominator != 0 else 0 @@ -86,7 +87,8 @@ def _build_corpus(sentences): def _get_important_sentences(sentences, corpus, important_docs): hashable_corpus = _build_hasheable_corpus(corpus) sentences_by_corpus = dict(zip(hashable_corpus, sentences)) - return [sentences_by_corpus[tuple(important_doc)] for important_doc in important_docs] + return [sentences_by_corpus[tuple(important_doc)] + for important_doc in important_docs] def _get_sentences_with_word_count(sentences, word_count): @@ -101,7 +103,12 @@ def _get_sentences_with_word_count(sentences, word_count): # Checks if the inclusion of the sentence gives a better approximation # to the word parameter. - if abs(word_count - length - words_in_sentence) > abs(word_count - length): + if abs( + word_count - + length - + words_in_sentence) > abs( + word_count - + length): return selected_sentences selected_sentences.append(sentence) @@ -110,12 +117,18 @@ def _get_sentences_with_word_count(sentences, word_count): return selected_sentences -def _extract_important_sentences(sentences, corpus, important_docs, word_count): - important_sentences = _get_important_sentences(sentences, corpus, important_docs) +def _extract_important_sentences( + sentences, + corpus, + important_docs, + word_count): + important_sentences = _get_important_sentences( + sentences, corpus, important_docs) # If no "word_count" option is provided, the number of sentences is # reduced by the provided ratio. Else, the ratio is ignored. - return important_sentences if word_count is None else _get_sentences_with_word_count(important_sentences, word_count) + return important_sentences if word_count is None else _get_sentences_with_word_count( + important_sentences, word_count) def _format_results(extracted_sentences, split): @@ -152,20 +165,27 @@ def summarize_corpus(corpus, ratio=0.2): # Warns the user if there are too few documents. if len(corpus) < INPUT_MIN_LENGTH: - logger.warning("Input corpus is expected to have at least " + str(INPUT_MIN_LENGTH) + " documents.") + logger.warning( + "Input corpus is expected to have at least " + + str(INPUT_MIN_LENGTH) + + " documents.") graph = _build_graph(hashable_corpus) _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) - # Cannot calculate eigenvectors if number of unique words in text < 3. Warns user to add more text. The function ends. + # Cannot calculate eigenvectors if number of unique words in text < 3. + # Warns user to add more text. 
The function ends. if len(graph.nodes()) < 3: - logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") + logger.warning( + "Please add more sentences to the text. The number of reachable nodes is below 3") return pagerank_scores = _pagerank(graph) - hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) + hashable_corpus.sort( + key=lambda doc: pagerank_scores.get( + doc, 0), reverse=True) return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]] @@ -198,20 +218,26 @@ def summarize(text, ratio=0.2, word_count=None, split=False): logger.warning("Input text is empty.") return - # If only one sentence is present, the function raises an error (Avoids ZeroDivisionError). + # If only one sentence is present, the function raises an error (Avoids + # ZeroDivisionError). if len(sentences) == 1: raise ValueError("input must have more than one sentence") - + # Warns if the text is too short. if len(sentences) < INPUT_MIN_LENGTH: - logger.warning("Input text is expected to have at least " + str(INPUT_MIN_LENGTH) + " sentences.") + logger.warning( + "Input text is expected to have at least " + + str(INPUT_MIN_LENGTH) + + " sentences.") corpus = _build_corpus(sentences) - most_important_docs = summarize_corpus(corpus, ratio=ratio if word_count is None else 1) + most_important_docs = summarize_corpus( + corpus, ratio=ratio if word_count is None else 1) # Extracts the most important sentences with the selected criterion. - extracted_sentences = _extract_important_sentences(sentences, corpus, most_important_docs, word_count) + extracted_sentences = _extract_important_sentences( + sentences, corpus, most_important_docs, word_count) # Sorts the extracted sentences by apparition order in the original text. 
extracted_sentences.sort(key=lambda s: s.index) diff --git a/gensim/summarization/syntactic_unit.py b/gensim/summarization/syntactic_unit.py index 89842e1122..5a84eca139 100644 --- a/gensim/summarization/syntactic_unit.py +++ b/gensim/summarization/syntactic_unit.py @@ -14,7 +14,8 @@ def __init__(self, text, token=None, tag=None): self.score = -1 def __str__(self): - return "Original unit: '" + self.text + "' *-*-*-* " + "Processed unit: '" + self.token + "'" + return "Original unit: '" + self.text + "' *-*-*-* " + \ + "Processed unit: '" + self.token + "'" def __repr__(self): return str(self) diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py index 7609da469a..eafbb706b0 100644 --- a/gensim/summarization/textcleaner.py +++ b/gensim/summarization/textcleaner.py @@ -17,22 +17,29 @@ logger.info("'pattern' package found; tag filters are available for English") HAS_PATTERN = True except ImportError: - logger.info("'pattern' package not found; tag filters are not available for English") + logger.info( + "'pattern' package not found; tag filters are not available for English") HAS_PATTERN = False SEPARATOR = r"@" -RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$) +# backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$) +RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)\s(\w)", re.UNICODE) AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)\s(\w)", re.UNICODE) AB_ACRONYM_LETTERS = re.compile("([a-zA-Z])\.([a-zA-Z])\.", re.UNICODE) -UNDO_AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)" + SEPARATOR + "(\w)", re.UNICODE) +UNDO_AB_SENIOR = re.compile( + "([A-Z][a-z]{1,2}\.)" + + SEPARATOR + + "(\w)", + re.UNICODE) UNDO_AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)" + SEPARATOR + "(\w)", re.UNICODE) def split_sentences(text): processed = replace_abbreviations(text) - return [undo_replacement(sentence) for sentence in get_sentences(processed)] + return [undo_replacement(sentence) + for sentence in get_sentences(processed)] def replace_abbreviations(text): @@ -40,7 +47,9 @@ def replace_abbreviations(text): def undo_replacement(sentence): - return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM]) + return replace_with_separator( + sentence, r" ", [ + UNDO_AB_SENIOR, UNDO_AB_ACRONYM]) def replace_with_separator(text, separator, regexs): @@ -81,7 +90,8 @@ def clean_text_by_sentences(text): """ Tokenizes a given text into sentences, applying filters and lemmatizing them. Returns a SyntacticUnit list. """ original_sentences = split_sentences(text) - filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)] + filtered_sentences = [ + join_words(sentence) for sentence in preprocess_documents(original_sentences)] return merge_syntactic_units(original_sentences, filtered_sentences) @@ -89,11 +99,18 @@ def clean_text_by_sentences(text): def clean_text_by_word(text): """ Tokenizes a given text into words, applying filters and lemmatizing them. Returns a dict of word -> syntacticUnit. 
""" - text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) - original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=True)) - filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)] + text_without_acronyms = replace_with_separator( + text, "", [AB_ACRONYM_LETTERS]) + original_words = list( + tokenize( + text_without_acronyms, + to_lower=True, + deacc=True)) + filtered_words = [join_words(word_list, "") + for word_list in preprocess_documents(original_words)] if HAS_PATTERN: - tags = tag(join_words(original_words)) # tag needs the context of the words in the text + # tag needs the context of the words in the text + tags = tag(join_words(original_words)) else: tags = None units = merge_syntactic_units(original_words, filtered_words, tags) @@ -101,5 +118,6 @@ def clean_text_by_word(text): def tokenize_by_word(text): - text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) + text_without_acronyms = replace_with_separator( + text, "", [AB_ACRONYM_LETTERS]) return tokenize(text_without_acronyms, to_lower=True, deacc=True) From c3b772e31b7f3b8d4ad88aab5699906ffd1aa10d Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Tue, 22 Nov 2016 00:50:18 -0800 Subject: [PATCH 2/9] Undo some changes to code. --- gensim/summarization/bm25.py | 25 ++++---------- gensim/summarization/graph.py | 14 +++----- gensim/summarization/keywords.py | 19 ++++------ gensim/summarization/pagerank_weighted.py | 6 ++-- gensim/summarization/summarizer.py | 39 ++++++--------------- gensim/summarization/syntactic_unit.py | 3 +- gensim/summarization/textcleaner.py | 42 +++++++---------------- 7 files changed, 41 insertions(+), 107 deletions(-) diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index f9aecbb43b..45379e2ffd 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -18,8 +18,7 @@ class BM25(object): def __init__(self, corpus): self.corpus_size = len(corpus) - self.avgdl = sum(map(lambda x: float(len(x)), corpus) - ) / self.corpus_size + self.avgdl = sum(map(lambda x: float(len(x)), corpus)) / self.corpus_size self.corpus = corpus self.f = [] self.df = {} @@ -41,27 +40,16 @@ def initialize(self): self.df[word] += 1 for word, freq in iteritems(self.df): - self.idf[word] = math.log( - self.corpus_size - freq + 0.5) - math.log(freq + 0.5) + self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5) def get_score(self, document, index, average_idf): score = 0 for word in document: if word not in self.f[index]: continue - idf = self.idf[word] if self.idf[ - word] >= 0 else EPSILON * average_idf - score += (idf * - self.f[index][word] * - (PARAM_K1 + - 1) / - (self.f[index][word] + - PARAM_K1 * - (1 - - PARAM_B + - PARAM_B * - self.corpus_size / - self.avgdl))) + idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf + score += (idf * self.f[index][word] * (PARAM_K1 + 1) / + (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl))) return score def get_scores(self, document, average_idf): @@ -74,8 +62,7 @@ def get_scores(self, document, average_idf): def get_bm25_weights(corpus): bm25 = BM25(corpus) - average_idf = sum(map(lambda k: float( - bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys()) + average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys()) weights = [] for doc in corpus: diff --git a/gensim/summarization/graph.py b/gensim/summarization/graph.py index 
7922ac7c9c..bfed410b5e 100644 --- a/gensim/summarization/graph.py +++ b/gensim/summarization/graph.py @@ -161,13 +161,10 @@ def __init__(self): def has_edge(self, edge): u, v = edge - return ( - u, v) in self.edge_properties and ( - v, u) in self.edge_properties + return (u, v) in self.edge_properties and (v, u) in self.edge_properties def edge_weight(self, edge): - return self.get_edge_properties(edge).setdefault( - self.WEIGHT_ATTRIBUTE_NAME, self.DEFAULT_WEIGHT) + return self.get_edge_properties(edge).setdefault(self.WEIGHT_ATTRIBUTE_NAME, self.DEFAULT_WEIGHT) def neighbors(self, node): return self.node_neighbors[node] @@ -221,9 +218,7 @@ def add_edge_attribute(self, edge, attr): self.edge_attr[edge] = self.edge_attributes(edge) + [attr] if edge[0] != edge[1]: - self.edge_attr[ - (edge[1], edge[0])] = self.edge_attributes( - (edge[1], edge[0])) + [attr] + self.edge_attr[(edge[1], edge[0])] = self.edge_attributes((edge[1], edge[0])) + [attr] def edge_attributes(self, edge): try: @@ -234,8 +229,7 @@ def edge_attributes(self, edge): def set_edge_properties(self, edge, **properties): self.edge_properties.setdefault(edge, {}).update(properties) if edge[0] != edge[1]: - self.edge_properties.setdefault( - (edge[1], edge[0]), {}).update(properties) + self.edge_properties.setdefault((edge[1], edge[0]), {}).update(properties) def del_edge(self, edge): u, v = edge diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index 7e9a919909..6eb128678c 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -37,8 +37,7 @@ def _get_words_for_graph(tokens, pos_filter): include_filters = set(pos_filter) exclude_filters = frozenset([]) if include_filters and exclude_filters: - raise ValueError( - "Can't use both include and exclude filters, should use only one") + raise ValueError("Can't use both include and exclude filters, should use only one") result = [] for word, unit in iteritems(tokens): @@ -59,8 +58,7 @@ def _set_graph_edge(graph, tokens, word_a, word_b): lemma_b = tokens[word_b].token edge = (lemma_a, lemma_b) - if graph.has_node(lemma_a) and graph.has_node( - lemma_b) and not graph.has_edge(edge): + if graph.has_node(lemma_a) and graph.has_node(lemma_b) and not graph.has_edge(edge): graph.add_edge(edge) @@ -167,8 +165,7 @@ def _get_combined_keywords(_keywords, split_text): result.append(word) for j in xrange(i + 1, len_text): other_word = _strip_word(split_text[j]) - if other_word in _keywords and other_word == split_text[ - j] and not other_word in combined_word: + if other_word in _keywords and other_word == split_text[j] and not other_word in combined_word: combined_word.append(other_word) else: for keyword in combined_word: @@ -193,12 +190,9 @@ def _format_results(_keywords, combined_keywords, split, scores): :param keywords:dict of keywords:scores :param combined_keywords:list of word/s """ - combined_keywords.sort( - key=lambda w: _get_average_score( - w, _keywords), reverse=True) + combined_keywords.sort(key=lambda w: _get_average_score(w, _keywords), reverse=True) if scores: - return [(word, _get_average_score(word, _keywords)) - for word in combined_keywords] + return [(word, _get_average_score(word, _keywords)) for word in combined_keywords] if split: return combined_keywords return "\n".join(combined_keywords) @@ -230,8 +224,7 @@ def keywords( # score pagerank_scores = _pagerank(graph) - extracted_lemmas = _extract_tokens( - graph.nodes(), pagerank_scores, ratio, words) + extracted_lemmas = _extract_tokens(graph.nodes(), 
pagerank_scores, ratio, words) # The results can be polluted by many variations of the same word if lemmatize: diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py index f2e97049e4..061a27f7ea 100644 --- a/gensim/summarization/pagerank_weighted.py +++ b/gensim/summarization/pagerank_weighted.py @@ -19,8 +19,7 @@ def pagerank_weighted(graph, damping=0.85): adjacency_matrix = build_adjacency_matrix(graph) probability_matrix = build_probability_matrix(graph) - pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * \ - probability_matrix + pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * \ probability_matrix # TODO raise an error if matrix has complex eigenvectors? vals, vecs = eigs(pagerank_matrix.T, k=1) @@ -37,8 +36,7 @@ def build_adjacency_matrix(graph): for i in xrange(length): current_node = nodes[i] - neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) - for neighbor in graph.neighbors(current_node)) + neighbors_sum = sum(graph.edge_weight((current_node, neighbor))for neighbor in graph.neighbors(current_node)) for j in xrange(length): edge_weight = float(graph.edge_weight((current_node, nodes[j]))) if i != j and edge_weight != 0.0: diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 1c8fc4f219..92716ed21b 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -72,8 +72,7 @@ def _get_similarity(doc1, doc2, vec1, vec2): length_1 = _get_doc_length(doc1) length_2 = _get_doc_length(doc2) - denominator = _log10( - length_1) + _log10(length_2) if length_1 > 0 and length_2 > 0 else 0 + denominator = _log10(length_1) + _log10(length_2) if length_1 > 0 and length_2 > 0 else 0 return numerator / denominator if denominator != 0 else 0 @@ -87,8 +86,7 @@ def _build_corpus(sentences): def _get_important_sentences(sentences, corpus, important_docs): hashable_corpus = _build_hasheable_corpus(corpus) sentences_by_corpus = dict(zip(hashable_corpus, sentences)) - return [sentences_by_corpus[tuple(important_doc)] - for important_doc in important_docs] + return [sentences_by_corpus[tuple(important_doc)] for important_doc in important_docs] def _get_sentences_with_word_count(sentences, word_count): @@ -103,12 +101,7 @@ def _get_sentences_with_word_count(sentences, word_count): # Checks if the inclusion of the sentence gives a better approximation # to the word parameter. - if abs( - word_count - - length - - words_in_sentence) > abs( - word_count - - length): + if abs(word_count - length - words_in_sentence) > abs(word_count - length): return selected_sentences selected_sentences.append(sentence) @@ -122,8 +115,7 @@ def _extract_important_sentences( corpus, important_docs, word_count): - important_sentences = _get_important_sentences( - sentences, corpus, important_docs) + important_sentences = _get_important_sentences(sentences, corpus, important_docs) # If no "word_count" option is provided, the number of sentences is # reduced by the provided ratio. Else, the ratio is ignored. @@ -165,10 +157,7 @@ def summarize_corpus(corpus, ratio=0.2): # Warns the user if there are too few documents. 
if len(corpus) < INPUT_MIN_LENGTH: - logger.warning( - "Input corpus is expected to have at least " + - str(INPUT_MIN_LENGTH) + - " documents.") + logger.warning("Input corpus is expected to have at least " + str(INPUT_MIN_LENGTH) + " documents.") graph = _build_graph(hashable_corpus) _set_graph_edge_weights(graph) @@ -177,15 +166,12 @@ def summarize_corpus(corpus, ratio=0.2): # Cannot calculate eigenvectors if number of unique words in text < 3. # Warns user to add more text. The function ends. if len(graph.nodes()) < 3: - logger.warning( - "Please add more sentences to the text. The number of reachable nodes is below 3") + logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") return pagerank_scores = _pagerank(graph) - hashable_corpus.sort( - key=lambda doc: pagerank_scores.get( - doc, 0), reverse=True) + hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]] @@ -225,19 +211,14 @@ def summarize(text, ratio=0.2, word_count=None, split=False): # Warns if the text is too short. if len(sentences) < INPUT_MIN_LENGTH: - logger.warning( - "Input text is expected to have at least " + - str(INPUT_MIN_LENGTH) + - " sentences.") + logger.warning("Input text is expected to have at least " + str(INPUT_MIN_LENGTH) + " sentences.") corpus = _build_corpus(sentences) - most_important_docs = summarize_corpus( - corpus, ratio=ratio if word_count is None else 1) + most_important_docs = summarize_corpus(corpus, ratio=ratio if word_count is None else 1) # Extracts the most important sentences with the selected criterion. - extracted_sentences = _extract_important_sentences( - sentences, corpus, most_important_docs, word_count) + extracted_sentences = _extract_important_sentences(sentences, corpus, most_important_docs, word_count) # Sorts the extracted sentences by apparition order in the original text. 
extracted_sentences.sort(key=lambda s: s.index) diff --git a/gensim/summarization/syntactic_unit.py b/gensim/summarization/syntactic_unit.py index 5a84eca139..89842e1122 100644 --- a/gensim/summarization/syntactic_unit.py +++ b/gensim/summarization/syntactic_unit.py @@ -14,8 +14,7 @@ def __init__(self, text, token=None, tag=None): self.score = -1 def __str__(self): - return "Original unit: '" + self.text + "' *-*-*-* " + \ - "Processed unit: '" + self.token + "'" + return "Original unit: '" + self.text + "' *-*-*-* " + "Processed unit: '" + self.token + "'" def __repr__(self): return str(self) diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py index eafbb706b0..a591b51b9a 100644 --- a/gensim/summarization/textcleaner.py +++ b/gensim/summarization/textcleaner.py @@ -17,29 +17,22 @@ logger.info("'pattern' package found; tag filters are available for English") HAS_PATTERN = True except ImportError: - logger.info( - "'pattern' package not found; tag filters are not available for English") + logger.info("'pattern' package not found; tag filters are not available for English") HAS_PATTERN = False SEPARATOR = r"@" -# backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$) -RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) +RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$) AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)\s(\w)", re.UNICODE) AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)\s(\w)", re.UNICODE) AB_ACRONYM_LETTERS = re.compile("([a-zA-Z])\.([a-zA-Z])\.", re.UNICODE) -UNDO_AB_SENIOR = re.compile( - "([A-Z][a-z]{1,2}\.)" + - SEPARATOR + - "(\w)", - re.UNICODE) +UNDO_AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)" + SEPARATOR + "(\w)", re.UNICODE) UNDO_AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)" + SEPARATOR + "(\w)", re.UNICODE) def split_sentences(text): processed = replace_abbreviations(text) - return [undo_replacement(sentence) - for sentence in get_sentences(processed)] + return [undo_replacement(sentence) for sentence in get_sentences(processed)] def replace_abbreviations(text): @@ -47,9 +40,7 @@ def replace_abbreviations(text): def undo_replacement(sentence): - return replace_with_separator( - sentence, r" ", [ - UNDO_AB_SENIOR, UNDO_AB_ACRONYM]) + return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM]) def replace_with_separator(text, separator, regexs): @@ -90,8 +81,7 @@ def clean_text_by_sentences(text): """ Tokenizes a given text into sentences, applying filters and lemmatizing them. Returns a SyntacticUnit list. """ original_sentences = split_sentences(text) - filtered_sentences = [ - join_words(sentence) for sentence in preprocess_documents(original_sentences)] + filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)] return merge_syntactic_units(original_sentences, filtered_sentences) @@ -99,18 +89,11 @@ def clean_text_by_sentences(text): def clean_text_by_word(text): """ Tokenizes a given text into words, applying filters and lemmatizing them. Returns a dict of word -> syntacticUnit. 
""" - text_without_acronyms = replace_with_separator( - text, "", [AB_ACRONYM_LETTERS]) - original_words = list( - tokenize( - text_without_acronyms, - to_lower=True, - deacc=True)) - filtered_words = [join_words(word_list, "") - for word_list in preprocess_documents(original_words)] + text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) + original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=True)) + filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)] if HAS_PATTERN: - # tag needs the context of the words in the text - tags = tag(join_words(original_words)) + tags = tag(join_words(original_words)) # tag needs the context of the words in the text else: tags = None units = merge_syntactic_units(original_words, filtered_words, tags) @@ -118,6 +101,5 @@ def clean_text_by_word(text): def tokenize_by_word(text): - text_without_acronyms = replace_with_separator( - text, "", [AB_ACRONYM_LETTERS]) - return tokenize(text_without_acronyms, to_lower=True, deacc=True) + text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) +return tokenize(text_without_acronyms, to_lower=True, deacc=True) From c2e20c9fb0f1e8017cbdc32347e7f711c2fd006e Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Tue, 22 Nov 2016 01:02:44 -0800 Subject: [PATCH 3/9] Fix an error in newline --- gensim/summarization/pagerank_weighted.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py index 061a27f7ea..be1a5dfef5 100644 --- a/gensim/summarization/pagerank_weighted.py +++ b/gensim/summarization/pagerank_weighted.py @@ -19,7 +19,7 @@ def pagerank_weighted(graph, damping=0.85): adjacency_matrix = build_adjacency_matrix(graph) probability_matrix = build_probability_matrix(graph) - pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * \ probability_matrix + pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix # TODO raise an error if matrix has complex eigenvectors? 
vals, vecs = eigs(pagerank_matrix.T, k=1) From fdcbc7502b66e3ffecba843467b02c8e60bd99b4 Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Tue, 22 Nov 2016 01:15:40 -0800 Subject: [PATCH 4/9] Fix indent --- gensim/summarization/textcleaner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py index a591b51b9a..7609da469a 100644 --- a/gensim/summarization/textcleaner.py +++ b/gensim/summarization/textcleaner.py @@ -102,4 +102,4 @@ def clean_text_by_word(text): def tokenize_by_word(text): text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) -return tokenize(text_without_acronyms, to_lower=True, deacc=True) + return tokenize(text_without_acronyms, to_lower=True, deacc=True) From 482ab89229d4a6d0c231391e274b0285e1671840 Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Thu, 24 Nov 2016 21:01:09 +0530 Subject: [PATCH 5/9] Fixes according to the review --- gensim/summarization/commons.py | 3 +-- gensim/summarization/keywords.py | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/gensim/summarization/commons.py b/gensim/summarization/commons.py index 4f19196066..1c467098f9 100644 --- a/gensim/summarization/commons.py +++ b/gensim/summarization/commons.py @@ -16,6 +16,5 @@ def build_graph(sequence): def remove_unreachable_nodes(graph): for node in graph.nodes(): - if sum(graph.edge_weight((node, other)) - for other in graph.neighbors(node)) == 0: + if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0: graph.del_node(node) diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index 6eb128678c..be535377f1 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -160,9 +160,8 @@ def _get_combined_keywords(_keywords, split_text): word = _strip_word(split_text[i]) if word in _keywords: combined_word = [word] - if i + 1 == len_text: - # appends last word if keyword and doesn't iterate - result.append(word) + if i + 1 == len_text: + result.append(word) # appends last word if keyword and doesn't iterate for j in xrange(i + 1, len_text): other_word = _strip_word(split_text[j]) if other_word in _keywords and other_word == split_text[j] and not other_word in combined_word: From f731f192bff80312bdb0402a810949adf532c6ae Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Mon, 28 Nov 2016 18:37:08 +0530 Subject: [PATCH 6/9] Updates to a few scripts --- gensim/summarization/keywords.py | 15 +++------------ gensim/summarization/summarizer.py | 9 ++------- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index be535377f1..3648952bdf 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -160,8 +160,8 @@ def _get_combined_keywords(_keywords, split_text): word = _strip_word(split_text[i]) if word in _keywords: combined_word = [word] - if i + 1 == len_text: - result.append(word) # appends last word if keyword and doesn't iterate + if i + 1 == len_text: + result.append(word) # appends last word if keyword and doesn't iterate for j in xrange(i + 1, len_text): other_word = _strip_word(split_text[j]) if other_word in _keywords and other_word == split_text[j] and not other_word in combined_word: @@ -197,16 +197,7 @@ def _format_results(_keywords, combined_keywords, split, scores): return "\n".join(combined_keywords) -def keywords( - text, - ratio=0.2, - words=None, - split=False, - scores=False, 
- pos_filter=[ - 'NN', - 'JJ'], - lemmatize=False): +def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False): # Gets a dict of word -> lemma text = to_unicode(text) tokens = _clean_text_by_word(text) diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 92716ed21b..f6c83e319d 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -110,17 +110,12 @@ def _get_sentences_with_word_count(sentences, word_count): return selected_sentences -def _extract_important_sentences( - sentences, - corpus, - important_docs, - word_count): +def _extract_important_sentences(sentences, corpus, important_docs, word_count): important_sentences = _get_important_sentences(sentences, corpus, important_docs) # If no "word_count" option is provided, the number of sentences is # reduced by the provided ratio. Else, the ratio is ignored. - return important_sentences if word_count is None else _get_sentences_with_word_count( - important_sentences, word_count) + return important_sentences if word_count is None else _get_sentences_with_word_count(important_sentences, word_count) def _format_results(extracted_sentences, split): From b21068f585de2d95a0b0938b64afe3de5394a4d4 Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Mon, 28 Nov 2016 19:13:24 +0530 Subject: [PATCH 7/9] Update bm25.py --- gensim/summarization/bm25.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index 45379e2ffd..bff3007ef9 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -48,8 +48,8 @@ def get_score(self, document, index, average_idf): if word not in self.f[index]: continue idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf - score += (idf * self.f[index][word] * (PARAM_K1 + 1) / - (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl))) + score += (idf * self.f[index][word] * (PARAM_K1 + 1) / (self.f[index][word] + PARAM_K1 * ( + 1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl))) return score def get_scores(self, document, average_idf): From fc3a8af742f618257ba7e221a137750e1a5464b6 Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Tue, 29 Nov 2016 00:14:52 +0530 Subject: [PATCH 8/9] Update keywords.py --- gensim/summarization/keywords.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index 3648952bdf..8ccc8e0554 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -164,7 +164,7 @@ def _get_combined_keywords(_keywords, split_text): result.append(word) # appends last word if keyword and doesn't iterate for j in xrange(i + 1, len_text): other_word = _strip_word(split_text[j]) - if other_word in _keywords and other_word == split_text[j] and not other_word in combined_word: + if other_word in _keywords and other_word == split_text[j] and other_word not in combined_word: combined_word.append(other_word) else: for keyword in combined_word: From 19312fc6010b4e011c3325d3ceb3b0630926fc00 Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Tue, 29 Nov 2016 18:37:18 +0530 Subject: [PATCH 9/9] Update expression to match PEP8 specifications I have made some changes to the expression for calculation of score to match PEP8 specifications. 
---
 gensim/summarization/bm25.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py
index bff3007ef9..b32d2f040d 100644
--- a/gensim/summarization/bm25.py
+++ b/gensim/summarization/bm25.py
@@ -48,8 +48,9 @@ def get_score(self, document, index, average_idf):
             if word not in self.f[index]:
                 continue
             idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
-            score += (idf * self.f[index][word] * (PARAM_K1 + 1) / (self.f[index][word] + PARAM_K1 * (
-                1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl)))
+            score += (idf * self.f[index][word] * (PARAM_K1 + 1)
+                      / (self.f[index][word] + PARAM_K1
+                         * (1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl)))
             return score

     def get_scores(self, document, average_idf):
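For reference, patches 7 through 9 above reshuffle the line breaks of the same BM25 scoring expression without changing what it computes. A minimal standalone sketch of that expression follows; it is an illustration only, the helper name bm25_score is hypothetical, and the constant values (PARAM_K1, PARAM_B, EPSILON) are assumed here because their definitions sit outside the hunks shown in this series.

# Hypothetical standalone rendering of BM25.get_score() from gensim/summarization/bm25.py.
# The parameter values below are assumed for illustration; the real ones are defined
# at the top of bm25.py and are not part of this diff.
PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25


def bm25_score(document, term_freqs, idf, average_idf, corpus_size, avgdl):
    """Sum the BM25 weight of every query word found in one document.

    document     -- list of query words
    term_freqs   -- word -> frequency in the scored document (self.f[index])
    idf          -- word -> inverse document frequency (self.idf)
    average_idf  -- mean IDF over the whole vocabulary
    corpus_size  -- number of documents in the corpus
    avgdl        -- average document length in words
    """
    score = 0.0
    for word in document:
        if word not in term_freqs:
            continue
        # Words with a negative IDF are clamped to a fraction of the average IDF.
        word_idf = idf[word] if idf[word] >= 0 else EPSILON * average_idf
        score += (word_idf * term_freqs[word] * (PARAM_K1 + 1)
                  / (term_freqs[word] + PARAM_K1
                     * (1 - PARAM_B + PARAM_B * corpus_size / avgdl)))
    return score

Every layout that patches 7-9 try parenthesizes the same terms in the same order, so the returned score is identical; only the line breaks move to satisfy the PEP8 line-length limit.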