Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "Corpus: Language Detection" #618

Merged
merged 1 commit into from
Jan 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 0 additions & 18 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import numpy as np
import scipy.sparse as sp
from gensim import corpora
import fasttext

from Orange.data import (
Variable,
Expand Down Expand Up @@ -83,7 +82,6 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
from orangecontrib.text.preprocess import PreprocessorList
self.__used_preprocessor = PreprocessorList([]) # required for compute values
self._titles: Optional[np.ndarray] = None
self.languages = None
self._pp_documents = None # preprocessed documents

if domain is not None and text_features is None:
Expand Down Expand Up @@ -226,22 +224,6 @@ def _unique_titles(titles: List[str]) -> List[str]:
new_titles.append(t)
return new_titles

def detect_languages(self):
    """
    Identify the language of every document with the fastText language
    identification model and store the result in ``self.languages``.

    Newlines in each document are replaced by spaces and only the first
    2000 space-separated tokens are passed to the model. For every
    document the first label returned by the model is kept, with the
    ``__label__`` marker removed.

    References:
    [A. Joulin, E. Grave, P. Bojanowski, T. Mikolov,
    Bag of Tricks for Efficient Text Classification],
    [A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov,
    FastText.zip: Compressing text classification models]
    """
    # The model file is looked up in the ``models`` directory that sits
    # next to this module.
    model_file = os.path.join(
        os.path.dirname(__file__), 'models', 'lid.176.ftz')
    identifier = fasttext.load_model(model_file)

    detected = []
    for document in self.documents:
        tokens = document.replace('\n', ' ').split(' ')
        snippet = ' '.join(tokens[:2000])
        # Keep the first entry of the first element returned by predict().
        first_label = identifier.predict(snippet)[0][0]
        detected.append(first_label.replace('__label__', ''))
    self.languages = detected

def _infer_text_features(self):
"""
Infer which text features to use. If nothing was provided
Expand Down
Binary file removed orangecontrib/text/models/lid.176.ftz
Binary file not shown.
8 changes: 0 additions & 8 deletions orangecontrib/text/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,14 +573,6 @@ def test_pickle_corpus(self):
c = pp(c)
pickle.dumps(c)

def test_languages(self):
    """Languages are unset until detected; 'deerwester' is all 'en'."""
    corpus = Corpus.from_file('deerwester')
    # Nothing is detected until detect_languages() is called explicitly.
    self.assertIsNone(corpus.languages)

    corpus.detect_languages()
    n_documents = len(corpus)
    # One language per document, and every one of them is 'en'.
    self.assertEqual(len(corpus.languages), n_documents)
    self.assertListEqual(corpus.languages, ['en'] * n_documents)


# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
unittest.main()
65 changes: 1 addition & 64 deletions orangecontrib/text/widgets/owcorpus.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import os
import numpy as np
from copy import copy

from Orange.data import Table, StringVariable, Variable, DiscreteVariable, \
Domain
from Orange.data import Table, StringVariable, Variable
from Orange.data.io import FileFormat
from Orange.widgets import gui
from Orange.widgets.utils.itemmodels import VariableListModel, DomainModel
Expand All @@ -14,7 +12,6 @@
from Orange.widgets.utils.concurrent import TaskState, ConcurrentWidgetMixin
from orangecontrib.text.corpus import Corpus, get_sample_corpora_dir
from orangecontrib.text.widgets.utils import widgets, QSize
from orangecontrib.text.vectorization.base import get_unique_names


class OWCorpus(OWWidget, ConcurrentWidgetMixin):
Expand Down Expand Up @@ -51,7 +48,6 @@ class Outputs:
])
used_attrs = ContextSetting([])
title_variable = ContextSetting("")
detect_languages = Setting(False)

class Error(OWWidget.Error):
read_file = Msg("Can't read file ({})")
Expand Down Expand Up @@ -104,10 +100,6 @@ def __init__(self):
self.unused_attrs_view.setModel(self.unused_attrs_model)
ibox.layout().addWidget(self.unused_attrs_view)

gui.checkBox(self.controlArea, self, "detect_languages",
"Detect language automatically",
callback=self.handle_languages)

# Documentation Data Sets & Report
box = gui.hBox(self.controlArea)
self.browse_documentation = gui.button(
Expand Down Expand Up @@ -254,7 +246,6 @@ def describe(corpus):
if self.corpus is None:
self.info.set_output_summary(self.info.NoOutput)
else:
self.handle_languages()
self.info.set_output_summary(
str(len(self.corpus)), describe(self.corpus))

Expand Down Expand Up @@ -312,60 +303,6 @@ def describe(features):
('Target', describe(domain.class_vars)),
))

# Add or remove a language meta column on ``self.corpus`` depending on the
# ``detect_languages`` setting, and send the corpus to the widget output.
# NOTE(review): indentation was lost in this view of the diff, so the exact
# nesting of the statements below (in particular which branch the final
# ``send()`` call belongs to) cannot be confirmed from here.
def handle_languages(self):
if self.corpus is not None:
domain = self.corpus.domain
if self.detect_languages:
# Languages are only computed when the corpus has none stored yet.
if self.corpus.languages is None:
self.corpus.detect_languages()

# Collect the names of all existing variables so that the new meta
# variable gets a name that does not clash with any of them.
curr_attributes = list(domain.attributes)
curr_class_var = [domain.class_var] if domain.class_var else []
curr_metas = list(domain.metas)
curr_variables = curr_attributes + curr_class_var + curr_metas
curr_names = [var.name for var in curr_variables]
new_name = get_unique_names(curr_names, "Language")

# The 'language-feature' attribute marks the variable so that the
# removal branch below can find it again.
variable_attrs = {'language-feature': True}
new_variable = StringVariable(new_name)
new_variable.attributes.update(variable_attrs)
# NOTE(review): ``class_vars`` receives ``domain.class_var`` (singular);
# presumably this is None for multi-target domains - confirm that class
# variables are not dropped in that case.
new_domain = Domain(
attributes=domain.attributes,
class_vars=domain.class_var,
metas=list(domain.metas) + [new_variable]
)
# Append the detected languages as one extra column of the metas.
metas = np.hstack([self.corpus.metas,
np.array(self.corpus.languages).reshape(-1, 1)])
# Build a new corpus on the extended domain from copies of the data.
self.corpus = Corpus(new_domain,
self.corpus.X.copy(),
self.corpus.Y.copy(),
metas,
self.corpus.W.copy(),
copy(self.corpus.text_features))
else:
# Find the first meta variable flagged as the language feature.
lang_feat_idx = None
for i, f in enumerate(domain.metas):
if ('language-feature' in f.attributes and
f.attributes['language-feature']):
lang_feat_idx = i
break
# Drop that variable from the domain and its column from the metas.
if lang_feat_idx is not None:
new_domain = Domain(
attributes=domain.attributes,
class_vars=domain.class_var,
metas=list(np.delete(list(domain.metas),
lang_feat_idx))
)
self.corpus = Corpus(
new_domain,
self.corpus.X.copy(),
self.corpus.Y.copy(),
np.delete(self.corpus.metas, lang_feat_idx, axis=1),
self.corpus.W.copy(),
copy(self.corpus.text_features)
)
# Publish the (possibly rebuilt) corpus on the widget's output.
self.Outputs.corpus.send(self.corpus)


if __name__ == '__main__':
from AnyQt.QtWidgets import QApplication
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,3 @@ docx2txt>=0.6
lxml
biopython # Enables Pubmed widget.
ufal.udpipe >=1.2.0.3
fasttext