[ENH] Computation in separate thread for base vectorizer; use base vectorizer for embedding #852

Merged
11 changes: 11 additions & 0 deletions orangecontrib/text/tests/test_bowvectorizer.py
@@ -1,4 +1,5 @@
import unittest
from unittest.mock import MagicMock, call

import numpy as np
from Orange.data import Domain, StringVariable
@@ -231,6 +232,16 @@ def test_tfidf_correctness(self):
idfs_test = self.test_counts * np.log(n / document_appearance)
self.assert_bow_same(bow_test, idfs_test, self.terms)

def test_callback(self):
vect = BowVectorizer()
corpus = Corpus.from_file("deerwester")
callback = MagicMock()

result = vect.transform(corpus, callback=callback)
self.assertIsInstance(result, Corpus)
self.assertEqual(len(result.domain.variables), 43)
callback.assert_has_calls([call(0.3), call(0.6), call(0.9), call(1)])


if __name__ == "__main__":
unittest.main()
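
The new test above pins down the callback contract: transform() takes a plain callable and reports progress as a fraction, here at the coarse milestones 0.3, 0.6, 0.9 and 1. A minimal usage sketch under that contract (the show_progress helper is illustrative, not part of the PR):

from orangecontrib.text import Corpus
from orangecontrib.text.vectorization.bagofwords import BowVectorizer

def show_progress(fraction):
    # illustrative consumer: print bag-of-words progress as a percentage
    print(f"bag-of-words: {fraction:.0%}")

corpus = Corpus.from_file("deerwester")
bow_corpus = BowVectorizer().transform(corpus, callback=show_progress)
print(len(bow_corpus.domain.variables))  # 43 for deerwester, per the test above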
36 changes: 13 additions & 23 deletions orangecontrib/text/tests/test_documentembedder.py
@@ -34,22 +34,22 @@ def tearDown(self):

@patch(PATCH_METHOD)
def test_with_empty_corpus(self, mock):
self.assertIsNone(self.embedder(self.corpus[:0])[0])
self.assertIsNone(self.embedder(self.corpus[:0])[1])
self.assertIsNone(self.embedder.transform(self.corpus[:0])[0])
self.assertIsNone(self.embedder.transform(self.corpus[:0])[1])
mock.request.assert_not_called()
mock.get_response.assert_not_called()
self.assertEqual(self.embedder._embedder._cache._cache_dict, dict())

@patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
def test_success_subset(self):
res, skipped = self.embedder(self.corpus[[0]])
res, skipped = self.embedder.transform(self.corpus[[0]])
assert_array_equal(res.X, [[0.3, 1]])
self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)
self.assertIsNone(skipped)

@patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
def test_success_shapes(self):
res, skipped = self.embedder(self.corpus)
res, skipped = self.embedder.transform(self.corpus)
self.assertEqual(res.X.shape, (len(self.corpus), 2))
self.assertEqual(len(res.domain.variables),
len(self.corpus.domain.variables) + 2)
@@ -58,31 +58,31 @@ def test_success_shapes(self):
@patch(PATCH_METHOD, make_dummy_post(b''))
def test_empty_response(self):
with self.assertWarns(RuntimeWarning):
res, skipped = self.embedder(self.corpus[[0]])
res, skipped = self.embedder.transform(self.corpus[[0]])
self.assertIsNone(res)
self.assertEqual(len(skipped), 1)
self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

@patch(PATCH_METHOD, make_dummy_post(b'str'))
def test_invalid_response(self):
with self.assertWarns(RuntimeWarning):
res, skipped = self.embedder(self.corpus[[0]])
res, skipped = self.embedder.transform(self.corpus[[0]])
self.assertIsNone(res)
self.assertEqual(len(skipped), 1)
self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

@patch(PATCH_METHOD, make_dummy_post(b'{"embeddings": [0.3, 1]}'))
def test_invalid_json_key(self):
with self.assertWarns(RuntimeWarning):
res, skipped = self.embedder(self.corpus[[0]])
res, skipped = self.embedder.transform(self.corpus[[0]])
self.assertIsNone(res)
self.assertEqual(len(skipped), 1)
self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

@patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
def test_persistent_caching(self):
self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)
self.embedder(self.corpus[[0]])
self.embedder.transform(self.corpus[[0]])
self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)
self.embedder._embedder._cache.persist_cache()

@@ -98,7 +98,7 @@ def test_cache_for_different_languages(self):
embedder = DocumentEmbedder(language='sl')
embedder.clear_cache()
self.assertEqual(len(embedder._embedder._cache._cache_dict), 0)
embedder(self.corpus[[0]])
embedder.transform(self.corpus[[0]])
self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
embedder._embedder._cache.persist_cache()

@@ -116,44 +116,34 @@ def test_cache_for_different_aggregators(self):
embedder = DocumentEmbedder(aggregator='max')
embedder.clear_cache()
self.assertEqual(len(embedder._embedder._cache._cache_dict), 0)
embedder(self.corpus[[0]])
embedder.transform(self.corpus[[0]])
self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
embedder._embedder._cache.persist_cache()

embedder = DocumentEmbedder(aggregator='min')
self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
embedder(self.corpus[[0]])
embedder.transform(self.corpus[[0]])
self.assertEqual(len(embedder._embedder._cache._cache_dict), 2)

@patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
def test_with_statement(self):
with self.embedder as embedder:
res, skipped = embedder(self.corpus[[0]])
assert_array_equal(res.X, [[0.3, 1]])

@patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
def test_cancel(self):
self.assertFalse(self.embedder._embedder._cancelled)
self.embedder._embedder._cancelled = True
with self.assertRaises(Exception):
self.embedder(self.corpus[[0]])
self.embedder.transform(self.corpus[[0]])

@patch(PATCH_METHOD, side_effect=OSError)
def test_connection_error(self, _):
embedder = DocumentEmbedder()
with self.assertRaises(ConnectionError):
embedder(self.corpus[[0]])
embedder.transform(self.corpus[[0]])

def test_invalid_parameters(self):
with self.assertRaises(ValueError):
self.embedder = DocumentEmbedder(language='eng')
with self.assertRaises(ValueError):
self.embedder = DocumentEmbedder(aggregator='average')

def test_invalid_corpus_type(self):
with self.assertRaises(ValueError):
self.embedder(self.corpus[0])


if __name__ == "__main__":
unittest.main()
15 changes: 15 additions & 0 deletions orangecontrib/text/tests/test_simhash.py
@@ -1,4 +1,5 @@
import unittest
from unittest.mock import MagicMock, call

from orangecontrib.text.corpus import Corpus
from orangecontrib.text.vectorization import SimhashVectorizer
@@ -18,3 +19,17 @@ def test_transform(self):
def test_report(self):
vect = SimhashVectorizer()
self.assertGreater(len(vect.report()), 0)

def test_callback(self):
vect = SimhashVectorizer(shingle_len=10, f=64)
callback = MagicMock()
result = vect.transform(self.corpus, callback=callback)

self.assertIsInstance(result, Corpus)
self.assertEqual(len(result), len(self.corpus))
self.assertEqual(result.X.shape, (len(self.corpus), 64))
callback.assert_has_calls([call(i / len(self.corpus)) for i in range(9)])


if __name__ == "__main__":
unittest.main()
9 changes: 8 additions & 1 deletion orangecontrib/text/vectorization/bagofwords.py
@@ -25,6 +25,7 @@
from functools import partial

import numpy as np
from Orange.util import dummy_callback
from gensim import corpora, models, matutils
from sklearn.preprocessing import normalize

@@ -68,18 +69,23 @@ def __init__(self, norm=NONE, wlocal=COUNT, wglobal=NONE):
self.wlocal = wlocal
self.wglobal = wglobal

def _transform(self, corpus, source_dict=None):
def _transform(self, corpus, source_dict=None, callback=dummy_callback):
if not (len(corpus.dictionary) or source_dict) or not len(corpus):
return corpus
temp_corpus = list(corpus.ngrams_iterator(' ', include_postags=True))
dic = corpora.Dictionary(temp_corpus, prune_at=None) if not source_dict else source_dict
callback(0.3)
temp_corpus = [dic.doc2bow(doc) for doc in temp_corpus]
model = models.TfidfModel(dictionary=dic, normalize=False,
wlocal=self.wlocals[self.wlocal],
wglobal=self.wglobals[self.wglobal])
callback(0.6)

X = matutils.corpus2csc(model[temp_corpus], dtype=float, num_terms=len(dic)).T
norm = self.norms[self.norm]
if norm:
X = norm(X)
callback(0.9)

# set compute values
shared_cv = SharedTransform(self, corpus.used_preprocessor,
@@ -88,6 +94,7 @@ def _transform(self, corpus, source_dict=None):
for i in range(len(dic))]

corpus = self.add_features(corpus, X, dic, cv, var_attrs={'bow-feature': True})
callback(1)
return corpus

def report(self):
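Since the point of the PR is to let widgets run vectorization off the GUI thread, here is a sketch of how the milestone callbacks surface when transform() runs in a worker. It deliberately uses the standard library rather than Orange's widget concurrency helpers, so it illustrates the threading idea, not the widget code itself:

from concurrent.futures import ThreadPoolExecutor

from orangecontrib.text import Corpus
from orangecontrib.text.vectorization.bagofwords import BowVectorizer

corpus = Corpus.from_file("deerwester")
vect = BowVectorizer()

# transform() runs on the worker; the callback fires there at 0.3, 0.6,
# 0.9 and 1, so a real widget would marshal these values to the GUI thread
with ThreadPoolExecutor(max_workers=1) as pool:
    future = pool.submit(vect.transform, corpus,
                         callback=lambda p: print(f"{p:.0%}"))
    bow_corpus = future.result()
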
9 changes: 4 additions & 5 deletions orangecontrib/text/vectorization/base.py
@@ -1,22 +1,21 @@
import numpy as np

from Orange.data.util import SharedComputeValue
from Orange.util import dummy_callback
from orangecontrib.text.util import Sparse2CorpusSliceable


class BaseVectorizer:
"""Base class for vectorization objects. """
name = NotImplemented

def transform(self, corpus, copy=True, source_dict=None):
def transform(self, corpus, copy=True, source_dict=None, callback=dummy_callback):
"""Transforms a corpus to a new one with additional attributes. """
if not (len(corpus.dictionary) or source_dict) or not len(corpus):
return corpus
if copy:
corpus = corpus.copy()
return self._transform(corpus, source_dict)
return self._transform(corpus, source_dict, callback)

def _transform(self, corpus, source_dict):
def _transform(self, corpus, source_dict, callback):
raise NotImplementedError

def report(self):
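The split above is the contract every vectorizer now implements: transform() performs the shared guards and the optional copy, while _transform() does the actual work and reports progress through callback. A hypothetical subclass, only to illustrate the shape of that contract:

from Orange.util import dummy_callback

from orangecontrib.text.vectorization.base import BaseVectorizer

class IdentityVectorizer(BaseVectorizer):
    # hypothetical example, not part of the PR: adds no features but
    # honours the transform()/_transform()/callback contract
    name = "Identity"

    def _transform(self, corpus, source_dict, callback=dummy_callback):
        callback(1)  # nothing to compute incrementally
        return corpus
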
46 changes: 16 additions & 30 deletions orangecontrib/text/vectorization/document_embedder.py
@@ -6,13 +6,14 @@
import sys
import warnings
import zlib
from typing import Any, List, Optional, Tuple, Union
from typing import Any, Optional, Tuple

import numpy as np
from Orange.misc.server_embedder import ServerEmbedderCommunicator
from Orange.util import dummy_callback

from orangecontrib.text import Corpus
from orangecontrib.text.vectorization.base import BaseVectorizer

AGGREGATORS = ['Mean', 'Sum', 'Max', 'Min']
AGGREGATORS_L = ['mean', 'sum', 'max', 'min']
@@ -52,7 +53,7 @@
LANGUAGES = list(LANGS_TO_ISO.values())


class DocumentEmbedder:
class DocumentEmbedder(BaseVectorizer):
"""This class is used for obtaining dense embeddings of documents in
corpus using fastText pretrained models from:
E. Grave, P. Bojanowski, P. Gupta, A. Joulin, T. Mikolov,
@@ -93,9 +94,9 @@ def __init__(self, language: str = 'en',
server_url='https://apiv2.garaza.io',
embedder_type='text')

def __call__(
self, corpus: Union[Corpus, List[List[str]]], callback=dummy_callback
) -> Union[Tuple[Corpus, Corpus], List[Optional[List[float]]]]:
def _transform(
self, corpus: Corpus, _, callback=dummy_callback
) -> Tuple[Corpus, Corpus]:
"""Adds matrix of document embeddings to a corpus.

Parameters
@@ -109,14 +110,7 @@ def __call__(
Corpus (original or a copy) with new features added.
Skipped documents
Corpus of documents that were not embedded

Raises
------
ValueError
If corpus is not instance of Corpus.
"""
if not isinstance(corpus, (Corpus, list)):
raise ValueError("Input should be instance of Corpus or list.")
embs = self._embedder.embedd_data(
list(corpus.ngrams) if isinstance(corpus, Corpus) else corpus,
callback=callback,
@@ -135,12 +129,6 @@ def __call__(
skipped_documents = [emb is None for emb in embs]
embedded_documents = np.logical_not(skipped_documents)

variable_attrs = {
'hidden': True,
'skip-normalization': True,
'embedding-feature': True
}

new_corpus = None
if np.any(embedded_documents):
# if at least one embedding is not None, extend attributes
@@ -150,18 +138,22 @@
[e for e, ns in zip(embs, embedded_documents) if ns],
dtype=float,
),
['Dim{}'.format(i + 1) for i in range(dim)],
var_attrs=variable_attrs
["Dim{}".format(i + 1) for i in range(dim)],
var_attrs={
"embedding-feature": True,
"hidden": True,
},
)

skipped_corpus = None
if np.any(skipped_documents):
skipped_corpus = corpus[skipped_documents].copy()
skipped_corpus.name = "Skipped documents"
warnings.warn(("Some documents were not embedded for " +
"unknown reason. Those documents " +
"are skipped."),
RuntimeWarning)
warnings.warn(
"Some documents were not embedded for unknown reason. Those "
"documents are skipped.",
RuntimeWarning,
)

return new_corpus, skipped_corpus

@@ -181,12 +173,6 @@ def clear_cache(self):
if self._embedder:
self._embedder.clear_cache()

def __enter__(self):
return self

def __exit__(self, _, __, ___):
pass


class _ServerEmbedder(ServerEmbedderCommunicator):
def __init__(self, aggregator: str, *args, **kwargs) -> None:
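With DocumentEmbedder now a BaseVectorizer, callers migrate from the removed __call__ and context-manager API to the same transform() entry point as the local vectorizers. A sketch of the unified call; note that, unlike the other vectorizers, this reaches the remote embedding server (https://apiv2.garaza.io), so it needs network access:

from orangecontrib.text import Corpus
from orangecontrib.text.vectorization.document_embedder import DocumentEmbedder

corpus = Corpus.from_file("deerwester")
embedder = DocumentEmbedder(language="en", aggregator="mean")

# returns a pair: the corpus with Dim1..DimN embedding columns appended,
# and a corpus of documents the server failed to embed (None if none failed)
embedded, skipped = embedder.transform(corpus)
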
12 changes: 9 additions & 3 deletions orangecontrib/text/vectorization/simhash.py
@@ -1,4 +1,5 @@
import nltk
from Orange.util import dummy_callback
from simhash import Simhash
import numpy as np

@@ -36,7 +37,7 @@ def compute_hash(self, tokens):
def int2binarray(self, num):
return [int(x) for x in self._bin_format.format(num)]

def _transform(self, corpus, source_dict):
def _transform(self, corpus, _, callback=dummy_callback):
""" Computes simhash values from the given corpus
and creates a new one with a simhash attribute.

@@ -46,8 +47,13 @@ def _transform(self, corpus, source_dict):
Returns:
Corpus with `simhash` variable
"""

X = np.array([self.int2binarray(self.compute_hash(doc)) for doc in corpus.tokens], dtype=float)
if not len(corpus):
return corpus
hashes = []
for i, doc in enumerate(corpus.tokens):
hashes.append(self.int2binarray(self.compute_hash(doc)))
callback(i / len(corpus))
X = np.array(hashes, dtype=float)
corpus = corpus.extend_attributes(
X,
feature_names=[
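Unlike the coarse bag-of-words milestones, the simhash callback now reports per-document progress (i / len(corpus)), so consumers see a smooth ramp. A small sketch with an illustrative progress printer:

from orangecontrib.text import Corpus
from orangecontrib.text.vectorization import SimhashVectorizer

corpus = Corpus.from_file("deerwester")
vect = SimhashVectorizer(shingle_len=10, f=64)

# the callback fires once per document
result = vect.transform(corpus, callback=lambda p: print(f"{p:.0%}"))
print(result.X.shape)  # (len(corpus), 64): one column per hash bit
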
17 changes: 7 additions & 10 deletions orangecontrib/text/widgets/owbagofwords.py
@@ -48,16 +48,13 @@ def create_configuration_layout(self):

return layout

def update_method(self):
self.method = self.Method(norm=self.normalization,
wlocal=self.wlocal,
wglobal=self.wglobal)
def init_method(self):
return self.Method(
norm=self.normalization, wlocal=self.wlocal, wglobal=self.wglobal
)


if __name__ == '__main__':
app = QApplication([])
widget = OWTBagOfWords()
widget.show()
corpus = Corpus.from_file('book-excerpts')
widget.set_data(corpus)
app.exec()
from orangewidget.utils.widgetpreview import WidgetPreview

WidgetPreview(OWTBagOfWords).run(Corpus.from_file("book-excerpts"))
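
The widget-side change mirrors the threading goal: instead of mutating self.method in place (the old update_method), the widget returns a fresh Method instance from init_method(), which the base widget can hand to a worker without racing against later settings changes. A hypothetical outline of that flow (the helper below is illustrative; run_vectorization and its wiring are not from the PR):

from Orange.widgets.utils.concurrent import TaskState

from orangecontrib.text import Corpus
from orangecontrib.text.widgets.owbagofwords import OWTBagOfWords

def run_vectorization(widget: OWTBagOfWords, corpus: Corpus, state: TaskState):
    # snapshot the widget settings into a fresh method, then vectorize in
    # the worker, forwarding 0..1 progress to the task state as 0..100
    method = widget.init_method()
    return method.transform(
        corpus, callback=lambda p: state.set_progress_value(p * 100)
    )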