Merge pull request #294 from lanzagar/preprocess
[FIX] preprocess: Use default tokenizer when None
nikicc authored Aug 2, 2017
2 parents 0240780 + f93d2b9 commit 1639182
Showing 3 changed files with 24 additions and 21 deletions.
orangecontrib/text/preprocess/__init__.py (3 changes: 0 additions & 3 deletions)

@@ -39,6 +39,3 @@
 from .tokenize import *
 from .transform import *
 from .preprocess import *
-
-base_preprocessor = Preprocessor(transformers=[LowercaseTransformer()],
-                                 tokenizer=WordPunctTokenizer())
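The module-level default moves out of __init__.py and into preprocess.py (next file), where it is added to that module's __all__; since __init__.py already does a wildcard import of preprocess.py, the public import path is unchanged. A minimal sketch of the unchanged call site, assuming orange3-text at this commit:

# A minimal sketch, assuming orange3-text at this commit: the import below
# still resolves because preprocess.py now defines base_preprocessor and
# lists it in __all__, and __init__.py re-exports it via
# 'from .preprocess import *'.
from orangecontrib.text.preprocess import base_preprocessor

print(base_preprocessor)  # the package-wide default: lowercase + word-punct tokenize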
orangecontrib/text/preprocess/preprocess.py (16 changes: 13 additions & 3 deletions)

@@ -1,6 +1,12 @@
-from orangecontrib.text.preprocess import FrequencyFilter
+from orangecontrib.text.preprocess import (
+    FrequencyFilter, LowercaseTransformer, WordPunctTokenizer)
 
-__all__ = ['Preprocessor']
+__all__ = ['Preprocessor', 'base_preprocessor']
+
+
+BASE_TOKENIZER = WordPunctTokenizer()
+BASE_TRANSFORMERS = [LowercaseTransformer()]
 
 
 class Preprocessor:
@@ -87,7 +93,7 @@ def process_document(self, document):
         if self.tokenizer:
             tokens = self.tokenizer.tokenize(document)
         else:
-            tokens = [document]
+            tokens = BASE_TOKENIZER.tokenize(document)
 
         if self.normalizer:
             tokens = self.normalizer(tokens)
@@ -133,3 +139,7 @@ def report(self):
             ('Frequency filter', str(self.freq_filter)),
             ('Pos tagger', str(self.pos_tagger)),
         )
+
+
+base_preprocessor = Preprocessor(transformers=BASE_TRANSFORMERS,
+                                 tokenizer=BASE_TOKENIZER)
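The middle hunk is the actual fix: a Preprocessor whose tokenizer is None used to wrap each document whole as a single pseudo-token; it now falls back to the module-level WordPunctTokenizer. A minimal sketch of the observable difference, assuming orange3-text at this commit, the usual Corpus import path, and the bundled 'deerwester' corpus:

from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import Preprocessor

corpus = Corpus.from_file('deerwester')
p = Preprocessor(tokenizer=None)  # no tokenizer supplied

# Before the fix: tokens for each document were [document], one untokenized
# string. After the fix: the default WordPunctTokenizer splits on
# word/punctuation boundaries, so each document yields a real token list.
tokens = p(corpus).tokens
assert len(tokens[0]) > 1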
orangecontrib/text/tests/test_preprocess.py (26 changes: 11 additions & 15 deletions)

@@ -26,20 +26,13 @@ def setUp(self):
         self.corpus = Corpus.from_file('deerwester')
 
     def test_string_processor(self):
-        class StripStringTransformer(preprocess.BaseTransformer):
-            @classmethod
-            def transform(cls, string):
-                return string[:-1]
-        p = Preprocessor(transformers=StripStringTransformer())
-
-        np.testing.assert_equal(p(self.corpus).tokens,
-                                np.array([[doc[:-1]] for doc in self.corpus.documents]))
-
-        p = Preprocessor(transformers=[StripStringTransformer(),
-                                       preprocess.LowercaseTransformer()])
-
-        np.testing.assert_equal(p(self.corpus).tokens,
-                                np.array([[doc[:-1].lower()] for doc in self.corpus.documents]))
+        p = Preprocessor(transformers=preprocess.LowercaseTransformer())
+        tokens = p(self.corpus).tokens
+        p2 = Preprocessor(transformers=[])
+        tokens2 = p2(self.corpus).tokens
+
+        np.testing.assert_equal(tokens,
+                                [[t.lower() for t in doc] for doc in tokens2])
 
         self.assertRaises(TypeError, Preprocessor, string_transformers=1)
@@ -59,9 +52,12 @@ class CapTokenNormalizer(preprocess.BaseNormalizer):
             def normalize(cls, token):
                 return token.capitalize()
         p = Preprocessor(normalizer=CapTokenNormalizer())
+        tokens = p(self.corpus).tokens
+        p2 = Preprocessor(normalizer=None)
+        tokens2 = p2(self.corpus).tokens
 
-        np.testing.assert_equal(p(self.corpus).tokens,
-                                np.array([[sent.capitalize()] for sent in self.corpus.documents]))
+        np.testing.assert_equal(
+            tokens, [[t.capitalize() for t in doc] for doc in tokens2])
 
     def test_token_filter(self):
         class SpaceTokenizer(preprocess.BaseTokenizer):
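The rewritten tests compare a configured Preprocessor against a neutral run of the same pipeline instead of against hard-coded arrays, which keeps them valid now that untokenized runs are word-punct split. The same relative check works against the new module-level default; a short sketch under the same assumptions as above, and additionally assuming Preprocessor accepts all-default arguments (as the keyword-only usage in these tests suggests):

from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import Preprocessor, base_preprocessor

corpus = Corpus.from_file('deerwester')
neutral = Preprocessor()(corpus).tokens     # default tokenization, no transforms
lowered = base_preprocessor(corpus).tokens  # LowercaseTransformer + same tokenizer

# base_preprocessor should differ from the neutral run only by letter case
# (true for the ASCII 'deerwester' texts, where lowercasing cannot move
# word/punctuation boundaries).
for plain_doc, low_doc in zip(neutral, lowered):
    assert [t.lower() for t in plain_doc] == list(low_doc)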
