miso-belica · miso-belica · Mar 9, 2022 · Mar 9, 2022
diff --git a/sumy/summarizers/sum_basic.py b/sumy/summarizers/sum_basic.py
@@ -8,10 +8,9 @@
 
 class SumBasicSummarizer(AbstractSummarizer):
     """
-    SumBasic: a frequency-based summarization system that adjusts word frequencies as 
+    SumBasic: a frequency-based summarization system that adjusts word frequencies as
     sentences are extracted.
     Source: http://www.cis.upenn.edu/~nenkova/papers/ipm.pdf
-
     """
     _stop_words = frozenset()
 
@@ -28,12 +27,11 @@ def __call__(self, document, sentences_count):
         ratings = self._compute_ratings(sentences)
         return self._get_best_sentences(document.sentences, sentences_count, ratings)
 
-    @staticmethod
-    def _get_all_words_in_doc(sentences):
-        return [w for s in sentences for w in s.words]
+    def _get_all_words_in_doc(self, sentences):
+        return self._stem_words([word for sentence in sentences for word in sentence.words])
 
     def _get_content_words_in_sentence(self, sentence):
-        normalized_words = self._normalize_words(sentence.words)   
+        normalized_words = self._normalize_words(sentence.words)
         normalized_content_words = self._filter_out_stop_words(normalized_words)
         stemmed_normalized_content_words = self._stem_words(normalized_content_words)
         return stemmed_normalized_content_words
@@ -77,7 +75,7 @@ def _compute_average_probability_of_words(word_freq_in_doc, content_words_in_sen
             word_freq_sum = sum([word_freq_in_doc[w] for w in content_words_in_sentence])
             word_freq_avg = word_freq_sum / content_words_count
             return word_freq_avg
-        else: 
+        else:
             return 0
 
     @staticmethod
@@ -100,13 +98,13 @@ def _find_index_of_best_sentence(self, word_freq, sentences_as_words):
     def _compute_ratings(self, sentences):
         word_freq = self._compute_tf(sentences)
         ratings = {}
-        
+
         # make it a list so that it can be modified
         sentences_list = list(sentences)
 
         # get all content words once for efficiency
         sentences_as_words = [self._get_content_words_in_sentence(s) for s in sentences]
-        
+
         # Removes one sentence per iteration by adding to summary
         while len(sentences_list) > 0:
             best_sentence_index = self._find_index_of_best_sentence(word_freq, sentences_as_words)

diff --git a/tests/test_summarizers/test_sum_basic.py b/tests/test_summarizers/test_sum_basic.py
@@ -30,7 +30,6 @@ def test_empty_document():
 
 
 def test_single_sentence():
-
     s = Sentence("I am one slightly longer sentence.", Tokenizer("english"))
     document = build_document([s])
     summarizer = _build_summarizer(EMPTY_STOP_WORDS)
@@ -39,6 +38,15 @@ def test_single_sentence():
     assert len(returned) == 1
 
 
+def test_stemmer_does_not_cause_crash():
+    """Summarizing with a real stemmer must not crash (https://github.com/miso-belica/sumy/issues/165)."""
+    german_sentence = Sentence("Was ist das längste deutsche Wort?", Tokenizer("german"))
+    summarizer = _build_summarizer(EMPTY_STOP_WORDS, Stemmer("german"))
+
+    summary = summarizer(build_document([german_sentence]), 10)
+    assert len(summary) == 1
+
+
 def test_normalize_words():
     summarizer = _build_summarizer(EMPTY_STOP_WORDS)
     sentence = "This iS A test 2 CHECk normalization."