Merge pull request #74 from miso-belica/fix-69-missing-stopwords
Added stopwords into SumBasic summarizer
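
With this change the summarizer no longer imports its own stop words (the `from ..utils import get_stop_words` import is removed), so callers supply them through the new `stop_words` setter. A minimal usage sketch against the sumy API; the sample text, the English tokenizer, and the sentence count are illustrative choices, not part of this commit:

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.utils import get_stop_words

# parse some plain text into a document (sample text is illustrative)
parser = PlaintextParser.from_string("Some document text. More sentences here.", Tokenizer("english"))
summarizer = SumBasicSummarizer()
summarizer.stop_words = get_stop_words("english")  # uses the setter added by this commit

for sentence in summarizer(parser.document, 2):  # top 2 sentences
    print(sentence)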
miso-belica authored Nov 17, 2016
2 parents 32cee98 + 726e6c3 commit 12cba48
Showing 1 changed file with 22 additions and 13 deletions.
--- a/sumy/summarizers/sum_basic.py
+++ b/sumy/summarizers/sum_basic.py
@@ -2,10 +2,8 @@
 
 from __future__ import absolute_import
 from __future__ import division, print_function, unicode_literals
-import math
 
 from ._summarizer import AbstractSummarizer
-from ..utils import get_stop_words
 
 
 class SumBasicSummarizer(AbstractSummarizer):
@@ -15,13 +13,23 @@ class SumBasicSummarizer(AbstractSummarizer):
     Source: http://www.cis.upenn.edu/~nenkova/papers/ipm.pdf
     """
+    _stop_words = frozenset()
+
+    @property
+    def stop_words(self):
+        return self._stop_words
+
+    @stop_words.setter
+    def stop_words(self, words):
+        self._stop_words = frozenset(map(self.normalize_word, words))
 
     def __call__(self, document, sentences_count):
         sentences = document.sentences
         ratings = self._compute_ratings(sentences)
         return self._get_best_sentences(document.sentences, sentences_count, ratings)
 
-    def _get_all_words_in_doc(self, sentences):
+    @staticmethod
+    def _get_all_words_in_doc(sentences):
         return [w for s in sentences for w in s.words]
 
     def _get_content_words_in_sentence(self, sentence):
@@ -35,7 +43,8 @@ def _normalize_words(self, words):
     def _filter_out_stop_words(self, words):
         return [w for w in words if w not in self.stop_words]
 
-    def _compute_word_freq(self, list_of_words):
+    @staticmethod
+    def _compute_word_freq(list_of_words):
         word_freq = {}
         for w in list_of_words:
             word_freq[w] = word_freq.get(w, 0) + 1
@@ -48,16 +57,17 @@ def _get_all_content_words_in_doc(self, sentences):
         return normalized_content_words
 
     def _compute_tf(self, sentences):
-        '''
+        """
         Computes the normalized term frequency as explained in http://www.tfidf.com/
-        '''
+        """
         content_words = self._get_all_content_words_in_doc(sentences)
         content_words_count = len(content_words)
         content_words_freq = self._compute_word_freq(content_words)
         content_word_tf = dict((k, v / content_words_count) for (k, v) in content_words_freq.items())
         return content_word_tf
 
-    def _compute_average_probability_of_words(self, word_freq_in_doc, content_words_in_sentence):
+    @staticmethod
+    def _compute_average_probability_of_words(word_freq_in_doc, content_words_in_sentence):
         content_words_count = len(content_words_in_sentence)
         if content_words_count > 0:
             word_freq_sum = sum([word_freq_in_doc[w] for w in content_words_in_sentence])
@@ -66,24 +76,23 @@ def _compute_average_probability_of_words(self, word_freq_in_doc, content_words_in_sentence):
         else:
             return 0
 
-    def _update_tf(self, word_freq, words_to_update):
+    @staticmethod
+    def _update_tf(word_freq, words_to_update):
         for w in words_to_update:
             word_freq[w] *= word_freq[w]
         return word_freq
 
-
     def _find_index_of_best_sentence(self, word_freq, sentences_as_words):
         min_possible_freq = -1
         max_value = min_possible_freq
         best_sentence_index = 0
         for i, words in enumerate(sentences_as_words):
             word_freq_avg = self._compute_average_probability_of_words(word_freq, words)
-            if (word_freq_avg > max_value):
+            if word_freq_avg > max_value:
                 max_value = word_freq_avg
                 best_sentence_index = i
         return best_sentence_index
 
-
     def _compute_ratings(self, sentences):
         word_freq = self._compute_tf(sentences)
         ratings = {}
@@ -100,10 +109,10 @@ def _compute_ratings(self, sentences):
             best_sentence = sentences_list.pop(best_sentence_index)
 
             # value is the iteration in which it was removed multiplied by -1 so that the first sentences removed (the most important) have highest values
-            ratings[best_sentence] = -1 * len(ratings)
+            ratings[best_sentence] = -len(ratings)
 
             # update probabilities
             best_sentence_words = sentences_as_words.pop(best_sentence_index)
             self._update_tf(word_freq, best_sentence_words)
 
-        return ratings
\ No newline at end of file
+        return ratings
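
For context, `_compute_ratings` runs the usual SumBasic loop: rate each sentence by the average probability of its content words (the normalized term frequency from `_compute_tf`), take the best sentence, then dampen the probability of every word it contained (`word_freq[w] *= word_freq[w]`) so later picks favor vocabulary the summary does not yet cover. A rough self-contained sketch of that loop, assuming plain lists of already-normalized words; the names here are illustrative, not sumy's:

def sum_basic_order(sentences_as_words):
    """Return sentence indices, most important first, per the SumBasic scheme above."""
    all_words = [w for words in sentences_as_words for w in words]
    prob = {w: all_words.count(w) / len(all_words) for w in set(all_words)}  # normalized TF
    remaining = list(range(len(sentences_as_words)))
    order = []
    while remaining:
        # average word probability per sentence; empty sentences rate 0
        def rate(i):
            words = sentences_as_words[i]
            return sum(prob[w] for w in words) / len(words) if words else 0
        best = max(remaining, key=rate)
        order.append(best)
        remaining.remove(best)
        for w in sentences_as_words[best]:
            prob[w] *= prob[w]  # dampen words the summary already covers
    return order

print(sum_basic_order([["cat", "sat"], ["cat", "ran", "fast"], ["dog", "ran"]]))  # [1, 0, 2]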
