From f051cbf721fbed682d864a4536565ba506f35261 Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Sun, 13 Nov 2016 08:37:16 -0800 Subject: [PATCH 1/9] PEP8 Fixes for Summarization. --- gensim/summarization/__init__.py | 2 +- gensim/summarization/bm25.py | 25 +++++++--- gensim/summarization/commons.py | 3 +- gensim/summarization/graph.py | 14 ++++-- gensim/summarization/keywords.py | 39 ++++++++++++---- gensim/summarization/pagerank_weighted.py | 9 ++-- gensim/summarization/summarizer.py | 56 +++++++++++++++++------ gensim/summarization/syntactic_unit.py | 3 +- gensim/summarization/textcleaner.py | 40 +++++++++++----- 9 files changed, 139 insertions(+), 52 deletions(-) diff --git a/gensim/summarization/__init__.py b/gensim/summarization/__init__.py index 57c9a7c815..c7efb84d4a 100644 --- a/gensim/summarization/__init__.py +++ b/gensim/summarization/__init__.py @@ -1,4 +1,4 @@ # bring model classes directly into package namespace, to save some typing from .summarizer import summarize, summarize_corpus -from .keywords import keywords \ No newline at end of file +from .keywords import keywords diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index 6704146d54..f9aecbb43b 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -18,7 +18,8 @@ class BM25(object): def __init__(self, corpus): self.corpus_size = len(corpus) - self.avgdl = sum(map(lambda x: float(len(x)), corpus)) / self.corpus_size + self.avgdl = sum(map(lambda x: float(len(x)), corpus) + ) / self.corpus_size self.corpus = corpus self.f = [] self.df = {} @@ -40,16 +41,27 @@ def initialize(self): self.df[word] += 1 for word, freq in iteritems(self.df): - self.idf[word] = math.log(self.corpus_size-freq+0.5) - math.log(freq+0.5) + self.idf[word] = math.log( + self.corpus_size - freq + 0.5) - math.log(freq + 0.5) def get_score(self, document, index, average_idf): score = 0 for word in document: if word not in self.f[index]: continue - idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf - score += (idf*self.f[index][word]*(PARAM_K1+1) - / (self.f[index][word] + PARAM_K1*(1 - PARAM_B+PARAM_B*self.corpus_size / self.avgdl))) + idf = self.idf[word] if self.idf[ + word] >= 0 else EPSILON * average_idf + score += (idf * + self.f[index][word] * + (PARAM_K1 + + 1) / + (self.f[index][word] + + PARAM_K1 * + (1 - + PARAM_B + + PARAM_B * + self.corpus_size / + self.avgdl))) return score def get_scores(self, document, average_idf): @@ -62,7 +74,8 @@ def get_scores(self, document, average_idf): def get_bm25_weights(corpus): bm25 = BM25(corpus) - average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys()) + average_idf = sum(map(lambda k: float( + bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys()) weights = [] for doc in corpus: diff --git a/gensim/summarization/commons.py b/gensim/summarization/commons.py index 1c467098f9..4f19196066 100644 --- a/gensim/summarization/commons.py +++ b/gensim/summarization/commons.py @@ -16,5 +16,6 @@ def build_graph(sequence): def remove_unreachable_nodes(graph): for node in graph.nodes(): - if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0: + if sum(graph.edge_weight((node, other)) + for other in graph.neighbors(node)) == 0: graph.del_node(node) diff --git a/gensim/summarization/graph.py b/gensim/summarization/graph.py index bfed410b5e..7922ac7c9c 100644 --- a/gensim/summarization/graph.py +++ b/gensim/summarization/graph.py @@ -161,10 +161,13 @@ def __init__(self): def has_edge(self, 
edge): u, v = edge - return (u, v) in self.edge_properties and (v, u) in self.edge_properties + return ( + u, v) in self.edge_properties and ( + v, u) in self.edge_properties def edge_weight(self, edge): - return self.get_edge_properties(edge).setdefault(self.WEIGHT_ATTRIBUTE_NAME, self.DEFAULT_WEIGHT) + return self.get_edge_properties(edge).setdefault( + self.WEIGHT_ATTRIBUTE_NAME, self.DEFAULT_WEIGHT) def neighbors(self, node): return self.node_neighbors[node] @@ -218,7 +221,9 @@ def add_edge_attribute(self, edge, attr): self.edge_attr[edge] = self.edge_attributes(edge) + [attr] if edge[0] != edge[1]: - self.edge_attr[(edge[1], edge[0])] = self.edge_attributes((edge[1], edge[0])) + [attr] + self.edge_attr[ + (edge[1], edge[0])] = self.edge_attributes( + (edge[1], edge[0])) + [attr] def edge_attributes(self, edge): try: @@ -229,7 +234,8 @@ def edge_attributes(self, edge): def set_edge_properties(self, edge, **properties): self.edge_properties.setdefault(edge, {}).update(properties) if edge[0] != edge[1]: - self.edge_properties.setdefault((edge[1], edge[0]), {}).update(properties) + self.edge_properties.setdefault( + (edge[1], edge[0]), {}).update(properties) def del_edge(self, edge): u, v = edge diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index 3bb7cee100..7e9a919909 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -37,7 +37,8 @@ def _get_words_for_graph(tokens, pos_filter): include_filters = set(pos_filter) exclude_filters = frozenset([]) if include_filters and exclude_filters: - raise ValueError("Can't use both include and exclude filters, should use only one") + raise ValueError( + "Can't use both include and exclude filters, should use only one") result = [] for word, unit in iteritems(tokens): @@ -58,7 +59,8 @@ def _set_graph_edge(graph, tokens, word_a, word_b): lemma_b = tokens[word_b].token edge = (lemma_a, lemma_b) - if graph.has_node(lemma_a) and graph.has_node(lemma_b) and not graph.has_edge(edge): + if graph.has_node(lemma_a) and graph.has_node( + lemma_b) and not graph.has_edge(edge): graph.add_edge(edge) @@ -161,10 +163,12 @@ def _get_combined_keywords(_keywords, split_text): if word in _keywords: combined_word = [word] if i + 1 == len_text: - result.append(word) # appends last word if keyword and doesn't iterate + # appends last word if keyword and doesn't iterate + result.append(word) for j in xrange(i + 1, len_text): other_word = _strip_word(split_text[j]) - if other_word in _keywords and other_word == split_text[j] and not other_word in combined_word: + if other_word in _keywords and other_word == split_text[ + j] and not other_word in combined_word: combined_word.append(other_word) else: for keyword in combined_word: @@ -189,15 +193,27 @@ def _format_results(_keywords, combined_keywords, split, scores): :param keywords:dict of keywords:scores :param combined_keywords:list of word/s """ - combined_keywords.sort(key=lambda w: _get_average_score(w, _keywords), reverse=True) + combined_keywords.sort( + key=lambda w: _get_average_score( + w, _keywords), reverse=True) if scores: - return [(word, _get_average_score(word, _keywords)) for word in combined_keywords] + return [(word, _get_average_score(word, _keywords)) + for word in combined_keywords] if split: return combined_keywords return "\n".join(combined_keywords) -def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False): +def keywords( + text, + ratio=0.2, + words=None, + split=False, + 
scores=False, + pos_filter=[ + 'NN', + 'JJ'], + lemmatize=False): # Gets a dict of word -> lemma text = to_unicode(text) tokens = _clean_text_by_word(text) @@ -210,10 +226,12 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= _remove_unreachable_nodes(graph) - # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score + # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> + # score pagerank_scores = _pagerank(graph) - extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words) + extracted_lemmas = _extract_tokens( + graph.nodes(), pagerank_scores, ratio, words) # The results can be polluted by many variations of the same word if lemmatize: @@ -225,7 +243,8 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word) - # text.split() to keep numbers and punctuation marks, so separeted concepts are not combined + # text.split() to keep numbers and punctuation marks, so separeted + # concepts are not combined combined_keywords = _get_combined_keywords(keywords, text.split()) return _format_results(keywords, combined_keywords, split, scores) diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py index 1978c6e1c7..f2e97049e4 100644 --- a/gensim/summarization/pagerank_weighted.py +++ b/gensim/summarization/pagerank_weighted.py @@ -19,9 +19,11 @@ def pagerank_weighted(graph, damping=0.85): adjacency_matrix = build_adjacency_matrix(graph) probability_matrix = build_probability_matrix(graph) - pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix + pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * \ + probability_matrix - vals, vecs = eigs(pagerank_matrix.T, k=1) # TODO raise an error if matrix has complex eigenvectors? + # TODO raise an error if matrix has complex eigenvectors? 
+ vals, vecs = eigs(pagerank_matrix.T, k=1) return process_results(graph, vecs.real) @@ -35,7 +37,8 @@ def build_adjacency_matrix(graph): for i in xrange(length): current_node = nodes[i] - neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node)) + neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) + for neighbor in graph.neighbors(current_node)) for j in xrange(length): edge_weight = float(graph.edge_weight((current_node, nodes[j]))) if i != j and edge_weight != 0.0: diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 0779011999..1c8fc4f219 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -72,7 +72,8 @@ def _get_similarity(doc1, doc2, vec1, vec2): length_1 = _get_doc_length(doc1) length_2 = _get_doc_length(doc2) - denominator = _log10(length_1) + _log10(length_2) if length_1 > 0 and length_2 > 0 else 0 + denominator = _log10( + length_1) + _log10(length_2) if length_1 > 0 and length_2 > 0 else 0 return numerator / denominator if denominator != 0 else 0 @@ -86,7 +87,8 @@ def _build_corpus(sentences): def _get_important_sentences(sentences, corpus, important_docs): hashable_corpus = _build_hasheable_corpus(corpus) sentences_by_corpus = dict(zip(hashable_corpus, sentences)) - return [sentences_by_corpus[tuple(important_doc)] for important_doc in important_docs] + return [sentences_by_corpus[tuple(important_doc)] + for important_doc in important_docs] def _get_sentences_with_word_count(sentences, word_count): @@ -101,7 +103,12 @@ def _get_sentences_with_word_count(sentences, word_count): # Checks if the inclusion of the sentence gives a better approximation # to the word parameter. - if abs(word_count - length - words_in_sentence) > abs(word_count - length): + if abs( + word_count - + length - + words_in_sentence) > abs( + word_count - + length): return selected_sentences selected_sentences.append(sentence) @@ -110,12 +117,18 @@ def _get_sentences_with_word_count(sentences, word_count): return selected_sentences -def _extract_important_sentences(sentences, corpus, important_docs, word_count): - important_sentences = _get_important_sentences(sentences, corpus, important_docs) +def _extract_important_sentences( + sentences, + corpus, + important_docs, + word_count): + important_sentences = _get_important_sentences( + sentences, corpus, important_docs) # If no "word_count" option is provided, the number of sentences is # reduced by the provided ratio. Else, the ratio is ignored. - return important_sentences if word_count is None else _get_sentences_with_word_count(important_sentences, word_count) + return important_sentences if word_count is None else _get_sentences_with_word_count( + important_sentences, word_count) def _format_results(extracted_sentences, split): @@ -152,20 +165,27 @@ def summarize_corpus(corpus, ratio=0.2): # Warns the user if there are too few documents. if len(corpus) < INPUT_MIN_LENGTH: - logger.warning("Input corpus is expected to have at least " + str(INPUT_MIN_LENGTH) + " documents.") + logger.warning( + "Input corpus is expected to have at least " + + str(INPUT_MIN_LENGTH) + + " documents.") graph = _build_graph(hashable_corpus) _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) - # Cannot calculate eigenvectors if number of unique words in text < 3. Warns user to add more text. The function ends. + # Cannot calculate eigenvectors if number of unique words in text < 3. + # Warns user to add more text. 
The function ends. if len(graph.nodes()) < 3: - logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") + logger.warning( + "Please add more sentences to the text. The number of reachable nodes is below 3") return pagerank_scores = _pagerank(graph) - hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) + hashable_corpus.sort( + key=lambda doc: pagerank_scores.get( + doc, 0), reverse=True) return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]] @@ -198,20 +218,26 @@ def summarize(text, ratio=0.2, word_count=None, split=False): logger.warning("Input text is empty.") return - # If only one sentence is present, the function raises an error (Avoids ZeroDivisionError). + # If only one sentence is present, the function raises an error (Avoids + # ZeroDivisionError). if len(sentences) == 1: raise ValueError("input must have more than one sentence") - + # Warns if the text is too short. if len(sentences) < INPUT_MIN_LENGTH: - logger.warning("Input text is expected to have at least " + str(INPUT_MIN_LENGTH) + " sentences.") + logger.warning( + "Input text is expected to have at least " + + str(INPUT_MIN_LENGTH) + + " sentences.") corpus = _build_corpus(sentences) - most_important_docs = summarize_corpus(corpus, ratio=ratio if word_count is None else 1) + most_important_docs = summarize_corpus( + corpus, ratio=ratio if word_count is None else 1) # Extracts the most important sentences with the selected criterion. - extracted_sentences = _extract_important_sentences(sentences, corpus, most_important_docs, word_count) + extracted_sentences = _extract_important_sentences( + sentences, corpus, most_important_docs, word_count) # Sorts the extracted sentences by apparition order in the original text. 
extracted_sentences.sort(key=lambda s: s.index) diff --git a/gensim/summarization/syntactic_unit.py b/gensim/summarization/syntactic_unit.py index 89842e1122..5a84eca139 100644 --- a/gensim/summarization/syntactic_unit.py +++ b/gensim/summarization/syntactic_unit.py @@ -14,7 +14,8 @@ def __init__(self, text, token=None, tag=None): self.score = -1 def __str__(self): - return "Original unit: '" + self.text + "' *-*-*-* " + "Processed unit: '" + self.token + "'" + return "Original unit: '" + self.text + "' *-*-*-* " + \ + "Processed unit: '" + self.token + "'" def __repr__(self): return str(self) diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py index 7609da469a..eafbb706b0 100644 --- a/gensim/summarization/textcleaner.py +++ b/gensim/summarization/textcleaner.py @@ -17,22 +17,29 @@ logger.info("'pattern' package found; tag filters are available for English") HAS_PATTERN = True except ImportError: - logger.info("'pattern' package not found; tag filters are not available for English") + logger.info( + "'pattern' package not found; tag filters are not available for English") HAS_PATTERN = False SEPARATOR = r"@" -RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$) +# backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$) +RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)\s(\w)", re.UNICODE) AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)\s(\w)", re.UNICODE) AB_ACRONYM_LETTERS = re.compile("([a-zA-Z])\.([a-zA-Z])\.", re.UNICODE) -UNDO_AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)" + SEPARATOR + "(\w)", re.UNICODE) +UNDO_AB_SENIOR = re.compile( + "([A-Z][a-z]{1,2}\.)" + + SEPARATOR + + "(\w)", + re.UNICODE) UNDO_AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)" + SEPARATOR + "(\w)", re.UNICODE) def split_sentences(text): processed = replace_abbreviations(text) - return [undo_replacement(sentence) for sentence in get_sentences(processed)] + return [undo_replacement(sentence) + for sentence in get_sentences(processed)] def replace_abbreviations(text): @@ -40,7 +47,9 @@ def replace_abbreviations(text): def undo_replacement(sentence): - return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM]) + return replace_with_separator( + sentence, r" ", [ + UNDO_AB_SENIOR, UNDO_AB_ACRONYM]) def replace_with_separator(text, separator, regexs): @@ -81,7 +90,8 @@ def clean_text_by_sentences(text): """ Tokenizes a given text into sentences, applying filters and lemmatizing them. Returns a SyntacticUnit list. """ original_sentences = split_sentences(text) - filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)] + filtered_sentences = [ + join_words(sentence) for sentence in preprocess_documents(original_sentences)] return merge_syntactic_units(original_sentences, filtered_sentences) @@ -89,11 +99,18 @@ def clean_text_by_sentences(text): def clean_text_by_word(text): """ Tokenizes a given text into words, applying filters and lemmatizing them. Returns a dict of word -> syntacticUnit. 
""" - text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) - original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=True)) - filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)] + text_without_acronyms = replace_with_separator( + text, "", [AB_ACRONYM_LETTERS]) + original_words = list( + tokenize( + text_without_acronyms, + to_lower=True, + deacc=True)) + filtered_words = [join_words(word_list, "") + for word_list in preprocess_documents(original_words)] if HAS_PATTERN: - tags = tag(join_words(original_words)) # tag needs the context of the words in the text + # tag needs the context of the words in the text + tags = tag(join_words(original_words)) else: tags = None units = merge_syntactic_units(original_words, filtered_words, tags) @@ -101,5 +118,6 @@ def clean_text_by_word(text): def tokenize_by_word(text): - text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) + text_without_acronyms = replace_with_separator( + text, "", [AB_ACRONYM_LETTERS]) return tokenize(text_without_acronyms, to_lower=True, deacc=True) From c3b772e31b7f3b8d4ad88aab5699906ffd1aa10d Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Tue, 22 Nov 2016 00:50:18 -0800 Subject: [PATCH 2/9] Undo some changes to code. --- gensim/summarization/bm25.py | 25 ++++---------- gensim/summarization/graph.py | 14 +++----- gensim/summarization/keywords.py | 19 ++++------ gensim/summarization/pagerank_weighted.py | 6 ++-- gensim/summarization/summarizer.py | 39 ++++++--------------- gensim/summarization/syntactic_unit.py | 3 +- gensim/summarization/textcleaner.py | 42 +++++++---------------- 7 files changed, 41 insertions(+), 107 deletions(-) diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index f9aecbb43b..45379e2ffd 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -18,8 +18,7 @@ class BM25(object): def __init__(self, corpus): self.corpus_size = len(corpus) - self.avgdl = sum(map(lambda x: float(len(x)), corpus) - ) / self.corpus_size + self.avgdl = sum(map(lambda x: float(len(x)), corpus)) / self.corpus_size self.corpus = corpus self.f = [] self.df = {} @@ -41,27 +40,16 @@ def initialize(self): self.df[word] += 1 for word, freq in iteritems(self.df): - self.idf[word] = math.log( - self.corpus_size - freq + 0.5) - math.log(freq + 0.5) + self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5) def get_score(self, document, index, average_idf): score = 0 for word in document: if word not in self.f[index]: continue - idf = self.idf[word] if self.idf[ - word] >= 0 else EPSILON * average_idf - score += (idf * - self.f[index][word] * - (PARAM_K1 + - 1) / - (self.f[index][word] + - PARAM_K1 * - (1 - - PARAM_B + - PARAM_B * - self.corpus_size / - self.avgdl))) + idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf + score += (idf * self.f[index][word] * (PARAM_K1 + 1) / + (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl))) return score def get_scores(self, document, average_idf): @@ -74,8 +62,7 @@ def get_scores(self, document, average_idf): def get_bm25_weights(corpus): bm25 = BM25(corpus) - average_idf = sum(map(lambda k: float( - bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys()) + average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys()) weights = [] for doc in corpus: diff --git a/gensim/summarization/graph.py b/gensim/summarization/graph.py index 
7922ac7c9c..bfed410b5e 100644 --- a/gensim/summarization/graph.py +++ b/gensim/summarization/graph.py @@ -161,13 +161,10 @@ def __init__(self): def has_edge(self, edge): u, v = edge - return ( - u, v) in self.edge_properties and ( - v, u) in self.edge_properties + return (u, v) in self.edge_properties and (v, u) in self.edge_properties def edge_weight(self, edge): - return self.get_edge_properties(edge).setdefault( - self.WEIGHT_ATTRIBUTE_NAME, self.DEFAULT_WEIGHT) + return self.get_edge_properties(edge).setdefault(self.WEIGHT_ATTRIBUTE_NAME, self.DEFAULT_WEIGHT) def neighbors(self, node): return self.node_neighbors[node] @@ -221,9 +218,7 @@ def add_edge_attribute(self, edge, attr): self.edge_attr[edge] = self.edge_attributes(edge) + [attr] if edge[0] != edge[1]: - self.edge_attr[ - (edge[1], edge[0])] = self.edge_attributes( - (edge[1], edge[0])) + [attr] + self.edge_attr[(edge[1], edge[0])] = self.edge_attributes((edge[1], edge[0])) + [attr] def edge_attributes(self, edge): try: @@ -234,8 +229,7 @@ def edge_attributes(self, edge): def set_edge_properties(self, edge, **properties): self.edge_properties.setdefault(edge, {}).update(properties) if edge[0] != edge[1]: - self.edge_properties.setdefault( - (edge[1], edge[0]), {}).update(properties) + self.edge_properties.setdefault((edge[1], edge[0]), {}).update(properties) def del_edge(self, edge): u, v = edge diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index 7e9a919909..6eb128678c 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -37,8 +37,7 @@ def _get_words_for_graph(tokens, pos_filter): include_filters = set(pos_filter) exclude_filters = frozenset([]) if include_filters and exclude_filters: - raise ValueError( - "Can't use both include and exclude filters, should use only one") + raise ValueError("Can't use both include and exclude filters, should use only one") result = [] for word, unit in iteritems(tokens): @@ -59,8 +58,7 @@ def _set_graph_edge(graph, tokens, word_a, word_b): lemma_b = tokens[word_b].token edge = (lemma_a, lemma_b) - if graph.has_node(lemma_a) and graph.has_node( - lemma_b) and not graph.has_edge(edge): + if graph.has_node(lemma_a) and graph.has_node(lemma_b) and not graph.has_edge(edge): graph.add_edge(edge) @@ -167,8 +165,7 @@ def _get_combined_keywords(_keywords, split_text): result.append(word) for j in xrange(i + 1, len_text): other_word = _strip_word(split_text[j]) - if other_word in _keywords and other_word == split_text[ - j] and not other_word in combined_word: + if other_word in _keywords and other_word == split_text[j] and not other_word in combined_word: combined_word.append(other_word) else: for keyword in combined_word: @@ -193,12 +190,9 @@ def _format_results(_keywords, combined_keywords, split, scores): :param keywords:dict of keywords:scores :param combined_keywords:list of word/s """ - combined_keywords.sort( - key=lambda w: _get_average_score( - w, _keywords), reverse=True) + combined_keywords.sort(key=lambda w: _get_average_score(w, _keywords), reverse=True) if scores: - return [(word, _get_average_score(word, _keywords)) - for word in combined_keywords] + return [(word, _get_average_score(word, _keywords)) for word in combined_keywords] if split: return combined_keywords return "\n".join(combined_keywords) @@ -230,8 +224,7 @@ def keywords( # score pagerank_scores = _pagerank(graph) - extracted_lemmas = _extract_tokens( - graph.nodes(), pagerank_scores, ratio, words) + extracted_lemmas = _extract_tokens(graph.nodes(), 
pagerank_scores, ratio, words) # The results can be polluted by many variations of the same word if lemmatize: diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py index f2e97049e4..061a27f7ea 100644 --- a/gensim/summarization/pagerank_weighted.py +++ b/gensim/summarization/pagerank_weighted.py @@ -19,8 +19,7 @@ def pagerank_weighted(graph, damping=0.85): adjacency_matrix = build_adjacency_matrix(graph) probability_matrix = build_probability_matrix(graph) - pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * \ - probability_matrix + pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * \ probability_matrix # TODO raise an error if matrix has complex eigenvectors? vals, vecs = eigs(pagerank_matrix.T, k=1) @@ -37,8 +36,7 @@ def build_adjacency_matrix(graph): for i in xrange(length): current_node = nodes[i] - neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) - for neighbor in graph.neighbors(current_node)) + neighbors_sum = sum(graph.edge_weight((current_node, neighbor))for neighbor in graph.neighbors(current_node)) for j in xrange(length): edge_weight = float(graph.edge_weight((current_node, nodes[j]))) if i != j and edge_weight != 0.0: diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 1c8fc4f219..92716ed21b 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -72,8 +72,7 @@ def _get_similarity(doc1, doc2, vec1, vec2): length_1 = _get_doc_length(doc1) length_2 = _get_doc_length(doc2) - denominator = _log10( - length_1) + _log10(length_2) if length_1 > 0 and length_2 > 0 else 0 + denominator = _log10(length_1) + _log10(length_2) if length_1 > 0 and length_2 > 0 else 0 return numerator / denominator if denominator != 0 else 0 @@ -87,8 +86,7 @@ def _build_corpus(sentences): def _get_important_sentences(sentences, corpus, important_docs): hashable_corpus = _build_hasheable_corpus(corpus) sentences_by_corpus = dict(zip(hashable_corpus, sentences)) - return [sentences_by_corpus[tuple(important_doc)] - for important_doc in important_docs] + return [sentences_by_corpus[tuple(important_doc)] for important_doc in important_docs] def _get_sentences_with_word_count(sentences, word_count): @@ -103,12 +101,7 @@ def _get_sentences_with_word_count(sentences, word_count): # Checks if the inclusion of the sentence gives a better approximation # to the word parameter. - if abs( - word_count - - length - - words_in_sentence) > abs( - word_count - - length): + if abs(word_count - length - words_in_sentence) > abs(word_count - length): return selected_sentences selected_sentences.append(sentence) @@ -122,8 +115,7 @@ def _extract_important_sentences( corpus, important_docs, word_count): - important_sentences = _get_important_sentences( - sentences, corpus, important_docs) + important_sentences = _get_important_sentences(sentences, corpus, important_docs) # If no "word_count" option is provided, the number of sentences is # reduced by the provided ratio. Else, the ratio is ignored. @@ -165,10 +157,7 @@ def summarize_corpus(corpus, ratio=0.2): # Warns the user if there are too few documents. 
if len(corpus) < INPUT_MIN_LENGTH: - logger.warning( - "Input corpus is expected to have at least " + - str(INPUT_MIN_LENGTH) + - " documents.") + logger.warning("Input corpus is expected to have at least " + str(INPUT_MIN_LENGTH) + " documents.") graph = _build_graph(hashable_corpus) _set_graph_edge_weights(graph) @@ -177,15 +166,12 @@ def summarize_corpus(corpus, ratio=0.2): # Cannot calculate eigenvectors if number of unique words in text < 3. # Warns user to add more text. The function ends. if len(graph.nodes()) < 3: - logger.warning( - "Please add more sentences to the text. The number of reachable nodes is below 3") + logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") return pagerank_scores = _pagerank(graph) - hashable_corpus.sort( - key=lambda doc: pagerank_scores.get( - doc, 0), reverse=True) + hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]] @@ -225,19 +211,14 @@ def summarize(text, ratio=0.2, word_count=None, split=False): # Warns if the text is too short. if len(sentences) < INPUT_MIN_LENGTH: - logger.warning( - "Input text is expected to have at least " + - str(INPUT_MIN_LENGTH) + - " sentences.") + logger.warning("Input text is expected to have at least " + str(INPUT_MIN_LENGTH) + " sentences.") corpus = _build_corpus(sentences) - most_important_docs = summarize_corpus( - corpus, ratio=ratio if word_count is None else 1) + most_important_docs = summarize_corpus(corpus, ratio=ratio if word_count is None else 1) # Extracts the most important sentences with the selected criterion. - extracted_sentences = _extract_important_sentences( - sentences, corpus, most_important_docs, word_count) + extracted_sentences = _extract_important_sentences(sentences, corpus, most_important_docs, word_count) # Sorts the extracted sentences by apparition order in the original text. 
extracted_sentences.sort(key=lambda s: s.index) diff --git a/gensim/summarization/syntactic_unit.py b/gensim/summarization/syntactic_unit.py index 5a84eca139..89842e1122 100644 --- a/gensim/summarization/syntactic_unit.py +++ b/gensim/summarization/syntactic_unit.py @@ -14,8 +14,7 @@ def __init__(self, text, token=None, tag=None): self.score = -1 def __str__(self): - return "Original unit: '" + self.text + "' *-*-*-* " + \ - "Processed unit: '" + self.token + "'" + return "Original unit: '" + self.text + "' *-*-*-* " + "Processed unit: '" + self.token + "'" def __repr__(self): return str(self) diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py index eafbb706b0..a591b51b9a 100644 --- a/gensim/summarization/textcleaner.py +++ b/gensim/summarization/textcleaner.py @@ -17,29 +17,22 @@ logger.info("'pattern' package found; tag filters are available for English") HAS_PATTERN = True except ImportError: - logger.info( - "'pattern' package not found; tag filters are not available for English") + logger.info("'pattern' package not found; tag filters are not available for English") HAS_PATTERN = False SEPARATOR = r"@" -# backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$) -RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) +RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$) AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)\s(\w)", re.UNICODE) AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)\s(\w)", re.UNICODE) AB_ACRONYM_LETTERS = re.compile("([a-zA-Z])\.([a-zA-Z])\.", re.UNICODE) -UNDO_AB_SENIOR = re.compile( - "([A-Z][a-z]{1,2}\.)" + - SEPARATOR + - "(\w)", - re.UNICODE) +UNDO_AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)" + SEPARATOR + "(\w)", re.UNICODE) UNDO_AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)" + SEPARATOR + "(\w)", re.UNICODE) def split_sentences(text): processed = replace_abbreviations(text) - return [undo_replacement(sentence) - for sentence in get_sentences(processed)] + return [undo_replacement(sentence) for sentence in get_sentences(processed)] def replace_abbreviations(text): @@ -47,9 +40,7 @@ def replace_abbreviations(text): def undo_replacement(sentence): - return replace_with_separator( - sentence, r" ", [ - UNDO_AB_SENIOR, UNDO_AB_ACRONYM]) + return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM]) def replace_with_separator(text, separator, regexs): @@ -90,8 +81,7 @@ def clean_text_by_sentences(text): """ Tokenizes a given text into sentences, applying filters and lemmatizing them. Returns a SyntacticUnit list. """ original_sentences = split_sentences(text) - filtered_sentences = [ - join_words(sentence) for sentence in preprocess_documents(original_sentences)] + filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)] return merge_syntactic_units(original_sentences, filtered_sentences) @@ -99,18 +89,11 @@ def clean_text_by_sentences(text): def clean_text_by_word(text): """ Tokenizes a given text into words, applying filters and lemmatizing them. Returns a dict of word -> syntacticUnit. 
""" - text_without_acronyms = replace_with_separator( - text, "", [AB_ACRONYM_LETTERS]) - original_words = list( - tokenize( - text_without_acronyms, - to_lower=True, - deacc=True)) - filtered_words = [join_words(word_list, "") - for word_list in preprocess_documents(original_words)] + text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) + original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=True)) + filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)] if HAS_PATTERN: - # tag needs the context of the words in the text - tags = tag(join_words(original_words)) + tags = tag(join_words(original_words)) # tag needs the context of the words in the text else: tags = None units = merge_syntactic_units(original_words, filtered_words, tags) @@ -118,6 +101,5 @@ def clean_text_by_word(text): def tokenize_by_word(text): - text_without_acronyms = replace_with_separator( - text, "", [AB_ACRONYM_LETTERS]) - return tokenize(text_without_acronyms, to_lower=True, deacc=True) + text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) +return tokenize(text_without_acronyms, to_lower=True, deacc=True) From c2e20c9fb0f1e8017cbdc32347e7f711c2fd006e Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Tue, 22 Nov 2016 01:02:44 -0800 Subject: [PATCH 3/9] Fix an error in newline --- gensim/summarization/pagerank_weighted.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py index 061a27f7ea..be1a5dfef5 100644 --- a/gensim/summarization/pagerank_weighted.py +++ b/gensim/summarization/pagerank_weighted.py @@ -19,7 +19,7 @@ def pagerank_weighted(graph, damping=0.85): adjacency_matrix = build_adjacency_matrix(graph) probability_matrix = build_probability_matrix(graph) - pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * \ probability_matrix + pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix # TODO raise an error if matrix has complex eigenvectors? 
vals, vecs = eigs(pagerank_matrix.T, k=1) From fdcbc7502b66e3ffecba843467b02c8e60bd99b4 Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Tue, 22 Nov 2016 01:15:40 -0800 Subject: [PATCH 4/9] Fix indent --- gensim/summarization/textcleaner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py index a591b51b9a..7609da469a 100644 --- a/gensim/summarization/textcleaner.py +++ b/gensim/summarization/textcleaner.py @@ -102,4 +102,4 @@ def clean_text_by_word(text): def tokenize_by_word(text): text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) -return tokenize(text_without_acronyms, to_lower=True, deacc=True) + return tokenize(text_without_acronyms, to_lower=True, deacc=True) From 482ab89229d4a6d0c231391e274b0285e1671840 Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Thu, 24 Nov 2016 21:01:09 +0530 Subject: [PATCH 5/9] Fixes according to the review --- gensim/summarization/commons.py | 3 +-- gensim/summarization/keywords.py | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/gensim/summarization/commons.py b/gensim/summarization/commons.py index 4f19196066..1c467098f9 100644 --- a/gensim/summarization/commons.py +++ b/gensim/summarization/commons.py @@ -16,6 +16,5 @@ def build_graph(sequence): def remove_unreachable_nodes(graph): for node in graph.nodes(): - if sum(graph.edge_weight((node, other)) - for other in graph.neighbors(node)) == 0: + if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0: graph.del_node(node) diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index 6eb128678c..be535377f1 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -160,9 +160,8 @@ def _get_combined_keywords(_keywords, split_text): word = _strip_word(split_text[i]) if word in _keywords: combined_word = [word] - if i + 1 == len_text: - # appends last word if keyword and doesn't iterate - result.append(word) + if i + 1 == len_text: + result.append(word) # appends last word if keyword and doesn't iterate for j in xrange(i + 1, len_text): other_word = _strip_word(split_text[j]) if other_word in _keywords and other_word == split_text[j] and not other_word in combined_word: From f731f192bff80312bdb0402a810949adf532c6ae Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Mon, 28 Nov 2016 18:37:08 +0530 Subject: [PATCH 6/9] Updates to a few scripts --- gensim/summarization/keywords.py | 15 +++------------ gensim/summarization/summarizer.py | 9 ++------- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index be535377f1..3648952bdf 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -160,8 +160,8 @@ def _get_combined_keywords(_keywords, split_text): word = _strip_word(split_text[i]) if word in _keywords: combined_word = [word] - if i + 1 == len_text: - result.append(word) # appends last word if keyword and doesn't iterate + if i + 1 == len_text: + result.append(word) # appends last word if keyword and doesn't iterate for j in xrange(i + 1, len_text): other_word = _strip_word(split_text[j]) if other_word in _keywords and other_word == split_text[j] and not other_word in combined_word: @@ -197,16 +197,7 @@ def _format_results(_keywords, combined_keywords, split, scores): return "\n".join(combined_keywords) -def keywords( - text, - ratio=0.2, - words=None, - split=False, - scores=False, 
- pos_filter=[ - 'NN', - 'JJ'], - lemmatize=False): +def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False): # Gets a dict of word -> lemma text = to_unicode(text) tokens = _clean_text_by_word(text) diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 92716ed21b..f6c83e319d 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -110,17 +110,12 @@ def _get_sentences_with_word_count(sentences, word_count): return selected_sentences -def _extract_important_sentences( - sentences, - corpus, - important_docs, - word_count): +def _extract_important_sentences(sentences, corpus, important_docs, word_count): important_sentences = _get_important_sentences(sentences, corpus, important_docs) # If no "word_count" option is provided, the number of sentences is # reduced by the provided ratio. Else, the ratio is ignored. - return important_sentences if word_count is None else _get_sentences_with_word_count( - important_sentences, word_count) + return important_sentences if word_count is None else _get_sentences_with_word_count(important_sentences, word_count) def _format_results(extracted_sentences, split): From b21068f585de2d95a0b0938b64afe3de5394a4d4 Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Mon, 28 Nov 2016 19:13:24 +0530 Subject: [PATCH 7/9] Update bm25.py --- gensim/summarization/bm25.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index 45379e2ffd..bff3007ef9 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -48,8 +48,8 @@ def get_score(self, document, index, average_idf): if word not in self.f[index]: continue idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf - score += (idf * self.f[index][word] * (PARAM_K1 + 1) / - (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl))) + score += (idf * self.f[index][word] * (PARAM_K1 + 1) / (self.f[index][word] + PARAM_K1 * ( + 1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl))) return score def get_scores(self, document, average_idf): From fc3a8af742f618257ba7e221a137750e1a5464b6 Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Tue, 29 Nov 2016 00:14:52 +0530 Subject: [PATCH 8/9] Update keywords.py --- gensim/summarization/keywords.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index 3648952bdf..8ccc8e0554 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -164,7 +164,7 @@ def _get_combined_keywords(_keywords, split_text): result.append(word) # appends last word if keyword and doesn't iterate for j in xrange(i + 1, len_text): other_word = _strip_word(split_text[j]) - if other_word in _keywords and other_word == split_text[j] and not other_word in combined_word: + if other_word in _keywords and other_word == split_text[j] and other_word not in combined_word: combined_word.append(other_word) else: for keyword in combined_word: From 19312fc6010b4e011c3325d3ceb3b0630926fc00 Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Tue, 29 Nov 2016 18:37:18 +0530 Subject: [PATCH 9/9] Update expression to match PEP8 specifications I have made some changes to the expression for calculation of score to match PEP8 specifications. 
---
 gensim/summarization/bm25.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py
index bff3007ef9..b32d2f040d 100644
--- a/gensim/summarization/bm25.py
+++ b/gensim/summarization/bm25.py
@@ -48,8 +48,9 @@ def get_score(self, document, index, average_idf):
             if word not in self.f[index]:
                 continue
             idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
-            score += (idf * self.f[index][word] * (PARAM_K1 + 1) / (self.f[index][word] + PARAM_K1 * (
-                1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl)))
+            score += (idf * self.f[index][word] * (PARAM_K1 + 1)
+                      / (self.f[index][word] + PARAM_K1
+                         * (1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl)))
             return score

     def get_scores(self, document, average_idf):
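For reference, patches 7 through 9 above reshuffle the line breaks of the same BM25 scoring expression without changing what it computes. A minimal standalone sketch of that expression follows; it is an illustration only, the helper name bm25_score is hypothetical, and the constant values (PARAM_K1, PARAM_B, EPSILON) are assumed here because their definitions sit outside the hunks shown in this series.

# Hypothetical standalone rendering of BM25.get_score() from gensim/summarization/bm25.py.
# The parameter values below are assumed for illustration; the real ones are defined
# at the top of bm25.py and are not part of this diff.
PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25


def bm25_score(document, term_freqs, idf, average_idf, corpus_size, avgdl):
    """Sum the BM25 weight of every query word found in one document.

    document     -- list of query words
    term_freqs   -- word -> frequency in the scored document (self.f[index])
    idf          -- word -> inverse document frequency (self.idf)
    average_idf  -- mean IDF over the whole vocabulary
    corpus_size  -- number of documents in the corpus
    avgdl        -- average document length in words
    """
    score = 0.0
    for word in document:
        if word not in term_freqs:
            continue
        # Words with a negative IDF are clamped to a fraction of the average IDF.
        word_idf = idf[word] if idf[word] >= 0 else EPSILON * average_idf
        score += (word_idf * term_freqs[word] * (PARAM_K1 + 1)
                  / (term_freqs[word] + PARAM_K1
                     * (1 - PARAM_B + PARAM_B * corpus_size / avgdl)))
    return score

Every layout that patches 7-9 try parenthesizes the same terms in the same order, so the returned score is identical; only the line breaks move to satisfy the PEP8 line-length limit.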