Merge pull request #74 from miso-belica/fix-69-missing-stopwords
Added stopwords into SumBasic summarizer
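
With this change the summarizer no longer imports its own stop words (the `from ..utils import get_stop_words` import is removed), so callers supply them through the new `stop_words` setter. A minimal usage sketch against the sumy API; the sample text, the English tokenizer, and the sentence count are illustrative choices, not part of this commit:

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.utils import get_stop_words

# parse some plain text into a document (sample text is illustrative)
parser = PlaintextParser.from_string("Some document text. More sentences here.", Tokenizer("english"))
summarizer = SumBasicSummarizer()
summarizer.stop_words = get_stop_words("english")  # uses the setter added by this commit

for sentence in summarizer(parser.document, 2):  # top 2 sentences
    print(sentence)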
miso-belica authored Nov 17, 2016
2 parents 32cee98 + 726e6c3 commit 12cba48
Showing 1 changed file with 22 additions and 13 deletions.
--- a/sumy/summarizers/sum_basic.py
+++ b/sumy/summarizers/sum_basic.py
@@ -2,10 +2,8 @@
 
 from __future__ import absolute_import
 from __future__ import division, print_function, unicode_literals
-import math
 
 from ._summarizer import AbstractSummarizer
-from ..utils import get_stop_words
 
 
 class SumBasicSummarizer(AbstractSummarizer):
@@ -15,13 +13,23 @@ class SumBasicSummarizer(AbstractSummarizer):
     Source: http://www.cis.upenn.edu/~nenkova/papers/ipm.pdf
     """
+    _stop_words = frozenset()
+
+    @property
+    def stop_words(self):
+        return self._stop_words
+
+    @stop_words.setter
+    def stop_words(self, words):
+        self._stop_words = frozenset(map(self.normalize_word, words))
 
     def __call__(self, document, sentences_count):
         sentences = document.sentences
         ratings = self._compute_ratings(sentences)
         return self._get_best_sentences(document.sentences, sentences_count, ratings)
 
-    def _get_all_words_in_doc(self, sentences):
+    @staticmethod
+    def _get_all_words_in_doc(sentences):
         return [w for s in sentences for w in s.words]
 
     def _get_content_words_in_sentence(self, sentence):
@@ -35,7 +43,8 @@ def _normalize_words(self, words):
     def _filter_out_stop_words(self, words):
         return [w for w in words if w not in self.stop_words]
 
-    def _compute_word_freq(self, list_of_words):
+    @staticmethod
+    def _compute_word_freq(list_of_words):
         word_freq = {}
         for w in list_of_words:
             word_freq[w] = word_freq.get(w, 0) + 1
@@ -48,16 +57,17 @@ def _get_all_content_words_in_doc(self, sentences):
         return normalized_content_words
 
     def _compute_tf(self, sentences):
-        '''
+        """
         Computes the normalized term frequency as explained in http://www.tfidf.com/
-        '''
+        """
         content_words = self._get_all_content_words_in_doc(sentences)
         content_words_count = len(content_words)
         content_words_freq = self._compute_word_freq(content_words)
         content_word_tf = dict((k, v / content_words_count) for (k, v) in content_words_freq.items())
         return content_word_tf
 
-    def _compute_average_probability_of_words(self, word_freq_in_doc, content_words_in_sentence):
+    @staticmethod
+    def _compute_average_probability_of_words(word_freq_in_doc, content_words_in_sentence):
         content_words_count = len(content_words_in_sentence)
         if content_words_count > 0:
             word_freq_sum = sum([word_freq_in_doc[w] for w in content_words_in_sentence])
@@ -66,24 +76,23 @@ def _compute_average_probability_of_words(self, word_freq_in_doc, content_words_in_sentence):
         else:
             return 0
 
-    def _update_tf(self, word_freq, words_to_update):
+    @staticmethod
+    def _update_tf(word_freq, words_to_update):
         for w in words_to_update:
             word_freq[w] *= word_freq[w]
         return word_freq
 
-
     def _find_index_of_best_sentence(self, word_freq, sentences_as_words):
         min_possible_freq = -1
         max_value = min_possible_freq
         best_sentence_index = 0
         for i, words in enumerate(sentences_as_words):
             word_freq_avg = self._compute_average_probability_of_words(word_freq, words)
-            if (word_freq_avg > max_value):
+            if word_freq_avg > max_value:
                 max_value = word_freq_avg
                 best_sentence_index = i
         return best_sentence_index
 
-
     def _compute_ratings(self, sentences):
         word_freq = self._compute_tf(sentences)
         ratings = {}
@@ -100,10 +109,10 @@ def _compute_ratings(self, sentences):
             best_sentence = sentences_list.pop(best_sentence_index)
 
             # value is the iteration in which it was removed multiplied by -1 so that the first sentences removed (the most important) have highest values
-            ratings[best_sentence] = -1 * len(ratings)
+            ratings[best_sentence] = -len(ratings)
 
             # update probabilities
             best_sentence_words = sentences_as_words.pop(best_sentence_index)
             self._update_tf(word_freq, best_sentence_words)
 
-        return ratings
\ No newline at end of file
+        return ratings
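
For context, `_compute_ratings` runs the usual SumBasic loop: rate each sentence by the average probability of its content words (the normalized term frequency from `_compute_tf`), take the best sentence, then dampen the probability of every word it contained (`word_freq[w] *= word_freq[w]`) so later picks favor vocabulary the summary does not yet cover. A rough self-contained sketch of that loop, assuming plain lists of already-normalized words; the names here are illustrative, not sumy's:

def sum_basic_order(sentences_as_words):
    """Return sentence indices, most important first, per the SumBasic scheme above."""
    all_words = [w for words in sentences_as_words for w in words]
    prob = {w: all_words.count(w) / len(all_words) for w in set(all_words)}  # normalized TF
    remaining = list(range(len(sentences_as_words)))
    order = []
    while remaining:
        # average word probability per sentence; empty sentences rate 0
        def rate(i):
            words = sentences_as_words[i]
            return sum(prob[w] for w in words) / len(words) if words else 0
        best = max(remaining, key=rate)
        order.append(best)
        remaining.remove(best)
        for w in sentences_as_words[best]:
            prob[w] *= prob[w]  # dampen words the summary already covers
    return order

print(sum_basic_order([["cat", "sat"], ["cat", "ran", "fast"], ["dog", "ran"]]))  # [1, 0, 2]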
