diff --git a/doc/widgets/wordcloud.md b/doc/widgets/wordcloud.md index 33abec80c..b5d3b821a 100644 --- a/doc/widgets/wordcloud.md +++ b/doc/widgets/wordcloud.md @@ -11,9 +11,10 @@ Generates a word cloud from corpus. **Outputs** - Corpus: Documents that match the selection. -- Word: Selected word that can be used as query in [Concordance](concordance.md). +- Selected Word: Selected word that can be used as a query in [Concordance](concordance.md). +- Word Counts: Words and their weights. -**Word Cloud** displays tokens in the corpus, their size denoting the frequency of the word in corpus. Words are listed by their frequency (weight) in the widget. The widget outputs documents, containing selected tokens from the word cloud. +**Word Cloud** displays tokens in the corpus, with the size of each word denoting its frequency in the corpus or, when bag of words features are present on the input of the widget, its average bag of words count. Words are listed by their frequency (weight) in the widget. The widget outputs the documents that contain the tokens selected in the word cloud. 
![](images/Word-Cloud-stamped.png) diff --git a/orangecontrib/text/widgets/owwordcloud.py b/orangecontrib/text/widgets/owwordcloud.py index cbdbb33ad..7fd9330e2 100644 --- a/orangecontrib/text/widgets/owwordcloud.py +++ b/orangecontrib/text/widgets/owwordcloud.py @@ -1,6 +1,7 @@ # coding: utf-8 from collections import Counter from math import pi as PI +from operator import itemgetter import numpy as np from AnyQt.QtCore import Qt, QItemSelection, QItemSelectionModel, pyqtSlot, \ @@ -40,6 +41,9 @@ class Outputs: class Warning(widget.OWWidget.Warning): topic_precedence = widget.Msg('Input signal Topic takes priority over Corpus') + class Info(widget.OWWidget.Information): + bow_weights = widget.Msg("Showing bag of words weights.") + def __init__(self): super().__init__() self.n_topic_words = 0 @@ -250,17 +254,45 @@ def create_weight_list(self): @Inputs.corpus def on_corpus_change(self, data): self.corpus = data + self.Info.clear() self.corpus_counter = Counter() if data is not None: - self.corpus_counter = Counter(w for doc in data.ngrams for w in doc) + bow_counts = self._bow_words() + if bow_counts: + self.Info.bow_weights() + self.corpus_counter = Counter(bow_counts) + else: + self.corpus_counter = Counter(w for doc in data.ngrams for w in doc) n_docs, n_words = len(data), len(self.corpus_counter) - self.documents_info_str = ('{} documents with {} words'.format(n_docs, n_words) - if data else '(no documents on input)') + self.documents_info_str = ( + '{} documents with {} words'.format(n_docs, n_words) + if data else '(no documents on input)') self.create_weight_list() + def _bow_words(self): + """ + Extract words from bag of words features and assign each of them + a frequency equal to its average bow count. 
+ """ + bow_features = self._get_bow_variables() + if not bow_features: + return {} + + average_bows = { + f.name: self.corpus.get_column_view(f)[0].mean() + for f in bow_features} + return average_bows + + def _get_bow_variables(self): + """ + Return the variables of the corpus domain marked as bow features. + """ + return [var for var in self.corpus.domain.variables + if var.attributes.get("bow-feature", False)] + def handleNewSignals(self): if self.topic is not None and len(self.topic): self._apply_topic() diff --git a/orangecontrib/text/widgets/tests/test_owworldcloud.py b/orangecontrib/text/widgets/tests/test_owworldcloud.py index 2191c63be..485e71729 100644 --- a/orangecontrib/text/widgets/tests/test_owworldcloud.py +++ b/orangecontrib/text/widgets/tests/test_owworldcloud.py @@ -1,6 +1,9 @@ import unittest +import numpy as np from Orange.widgets.tests.base import WidgetTest +from scipy.sparse import csr_matrix + from orangecontrib.text.corpus import Corpus from orangecontrib.text.widgets.owwordcloud import OWWordCloud @@ -29,6 +32,72 @@ def test_empty_data(self): self.send_signal(self.widget.Inputs.corpus, self.corpus[:0]) self.assertTrue(self.widget.documents_info_str == "(no documents on input)") + def test_bow_features(self): + """ + When bag of words features are on the input, the word cloud must be + built from the BOW weights. 
+ """ + data = self.corpus[:3] + data.extend_attributes( + csr_matrix([[3, 2, 0], [0, 3, 6], [0, 1, 0]]), + ["Word1", "Word2", "Word3"]) + for v in data.domain.attributes: + v.attributes["bow-feature"] = True + + self.send_signal(self.widget.Inputs.corpus, data) + self.assertDictEqual( + self.widget.corpus_counter, {"Word1": 1, "Word2": 2, "Word3": 2}) + output = self.get_output(self.widget.Outputs.word_counts) + np.testing.assert_array_equal([2, 2, 1], output.X.flatten()) + np.testing.assert_array_equal( + ["Word2", "Word3", "Word1"], output.metas.flatten()) + self.assertListEqual( + [(2.0, 'Word2'), (2.0, 'Word3'), (1.0, 'Word1')], + self.widget.tablemodel[:]) + + # try with one word not bow-feature + data = self.corpus[:3] + data.extend_attributes( + csr_matrix([[3, 2, 0], [0, 3, 6], [0, 1, 0]]), + ["Word1", "Word2", "Word3"]) + for v in data.domain.attributes[:2]: + v.attributes["bow-feature"] = True + + self.send_signal(self.widget.Inputs.corpus, data) + self.assertDictEqual( + self.widget.corpus_counter, {"Word1": 1, "Word2": 2}) + output = self.get_output(self.widget.Outputs.word_counts) + np.testing.assert_array_equal([2, 1], output.X.flatten()) + np.testing.assert_array_equal( + ["Word2", "Word1"], output.metas.flatten()) + self.assertListEqual( + [(2.0, 'Word2'), (1.0, 'Word1')], + self.widget.tablemodel[:]) + + def test_bow_info(self): + """ + The widget shows an info message only when bow-features are used. 
+ """ + data = self.corpus[:3] + + # no data no info + self.assertFalse(self.widget.Info.bow_weights.is_shown()) + self.send_signal(self.widget.Inputs.corpus, data) + self.assertFalse(self.widget.Info.bow_weights.is_shown()) + self.send_signal(self.widget.Inputs.corpus, None) + self.assertFalse(self.widget.Info.bow_weights.is_shown()) + + # send bow data + data.extend_attributes( + csr_matrix([[3, 2, 0], [0, 3, 6], [0, 1, 0]]), + ["Word1", "Word2", "Word3"]) + for v in data.domain.attributes: + v.attributes["bow-feature"] = True + self.send_signal(self.widget.Inputs.corpus, data) + self.assertTrue(self.widget.Info.bow_weights.is_shown()) + self.send_signal(self.widget.Inputs.corpus, None) + self.assertFalse(self.widget.Info.bow_weights.is_shown()) + if __name__ == "__main__": unittest.main()