Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add initial Chinese support for hero.lang.zh.preprocessing #128

Draft
wants to merge 14 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ install_requires =
unidecode>=1.1.1
gensim>=3.6.0
matplotlib>=3.1.0
wrapt>=1.12.1
jieba>=0.42.1
# TODO: confirm the minimum required versions for wrapt and jieba.
[options.extras_require]
dev =
Expand Down
Empty file added tests/lang/__init__.py
Empty file.
Empty file added tests/lang/zh/__init__.py
Empty file.
125 changes: 125 additions & 0 deletions tests/lang/zh/test_indexes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import pandas as pd
from texthero.lang.hero_zh import preprocessing

from ... import PandasTestCase
import unittest
import string
from parameterized import parameterized


# Define valid inputs for different functions.
# NOTE(review): the non-default index ([5], [5, 6]) lets the tests below
# detect functions that accidentally reset or drop the input's index.
s_text = pd.Series(["Test"], index=[5])
s_numeric = pd.Series([5.0], index=[5])
s_numeric_lists = pd.Series([[5.0, 5.0], [6.0, 6.0]], index=[5, 6])

# Define all test cases. Every test case is a list
# of [name of test case, function to test, tuple of valid input for the function].
# First argument of valid input has to be the Pandas Series where we
# want to keep the index. If this is different for a function, a separate
# test case has to be implemented in the class below.
# The tests will be run by AbstractIndexTest below through the @parameterized
# decorator.
# The names will be expanded automatically, so e.g. "named_entities"
# creates test cases test_correct_index_named_entities and
# test_incorrect_index_named_entities.


test_cases_preprocessing = [
    ["fillna", preprocessing.fillna, (s_text,)],
    ["remove_whitespace", preprocessing.remove_whitespace, (s_text,)],
    ["clean", preprocessing.clean, (s_text,)],
    ["remove_html_tags", preprocessing.remove_html_tags, (s_text,)],
    ["tokenize", preprocessing.tokenize, (s_text,)],
    ["replace_urls", preprocessing.replace_urls, (s_text, "")],
    ["remove_urls", preprocessing.remove_urls, (s_text,)],
    ["replace_tags", preprocessing.replace_tags, (s_text, "")],
    ["remove_tags", preprocessing.remove_tags, (s_text,)],
    ["replace_hashtags", preprocessing.replace_hashtags, (s_text, "")],
    ["remove_hashtags", preprocessing.remove_hashtags, (s_text,)],
]

# Only the preprocessing cases are implemented for Chinese so far; the
# nlp/representation/visualization cases remain commented out below.
test_cases = test_cases_preprocessing

# test_cases_nlp = [
# ["named_entities", nlp.named_entities, (s_text,)],
# ["noun_chunks", nlp.noun_chunks, (s_text,)],
# ]
#
# test_cases_preprocessing = [
# ["fillna", preprocessing.fillna, (s_text,)],
# ["lowercase", preprocessing.lowercase, (s_text,)],
# ["replace_digits", preprocessing.replace_digits, (s_text, "")],
# ["remove_digits", preprocessing.remove_digits, (s_text,)],
# ["replace_punctuation", preprocessing.replace_punctuation, (s_text, "")],
# ["remove_punctuation", preprocessing.remove_punctuation, (s_text,)],
# ["remove_diacritics", preprocessing.remove_diacritics, (s_text,)],
# ["remove_whitespace", preprocessing.remove_whitespace, (s_text,)],
# ["replace_stopwords", preprocessing.replace_stopwords, (s_text, "")],
# ["remove_stopwords", preprocessing.remove_stopwords, (s_text,)],
# ["stem", preprocessing.stem, (s_text,)],
# ["clean", preprocessing.clean, (s_text,)],
# ["remove_round_brackets", preprocessing.remove_round_brackets, (s_text,)],
# ["remove_curly_brackets", preprocessing.remove_curly_brackets, (s_text,)],
# ["remove_square_brackets", preprocessing.remove_square_brackets, (s_text,)],
# ["remove_angle_brackets", preprocessing.remove_angle_brackets, (s_text,)],
# ["remove_brackets", preprocessing.remove_brackets, (s_text,)],
# ["remove_html_tags", preprocessing.remove_html_tags, (s_text,)],
# ["tokenize", preprocessing.tokenize, (s_text,)],
# ["tokenize_with_phrases", preprocessing.tokenize_with_phrases, (s_text,)],
# ["replace_urls", preprocessing.replace_urls, (s_text, "")],
# ["remove_urls", preprocessing.remove_urls, (s_text,)],
# ["replace_tags", preprocessing.replace_tags, (s_text, "")],
# ["remove_tags", preprocessing.remove_tags, (s_text,)],
# ]
#
# test_cases_representation = [
# ["count", representation.count, (preprocessing.tokenize(s_text),),],
# [
# "term_frequency",
# representation.term_frequency,
# (preprocessing.tokenize(s_text),),
# ],
# ["tfidf", representation.tfidf, (preprocessing.tokenize(s_text),)],
# ["pca", representation.pca, (s_numeric_lists, 0)],
# ["nmf", representation.nmf, (s_numeric_lists,)],
# ["tsne", representation.tsne, (s_numeric_lists,)],
# ["kmeans", representation.kmeans, (s_numeric_lists, 1)],
# ["dbscan", representation.dbscan, (s_numeric_lists,)],
# ["meanshift", representation.meanshift, (s_numeric_lists,)],
# ]
#
# test_cases_visualization = []
#
# test_cases = (
# test_cases_nlp
# + test_cases_preprocessing
# + test_cases_representation
# + test_cases_visualization
# )


class AbstractIndexTest(PandasTestCase):
    """
    Index-preservation tests driven by ``test_cases``.

    For every (name, function, valid_input) triple defined above, this
    class verifies that the function keeps the index of its first Series
    argument intact. Functions that take different inputs need a
    hand-written test instead of an entry in ``test_cases``.
    """

    @parameterized.expand(test_cases)
    def test_correct_index(self, name, test_function, valid_input):
        """The output index must equal the input index."""
        series = valid_input[0]
        expected = pd.Series(series.values, series.index)
        result = test_function(*valid_input)
        self.assertTrue(result.index.equals(expected.index))

    @parameterized.expand(test_cases)
    def test_incorrect_index(self, name, test_function, valid_input):
        """The output index must differ from a default (RangeIndex) one."""
        series = valid_input[0]
        default_indexed = pd.Series(series.values, index=None)
        result = test_function(*valid_input)
        self.assertFalse(result.index.equals(default_indexed.index))
126 changes: 126 additions & 0 deletions tests/lang/zh/test_preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import string

import pandas as pd
import numpy as np
import doctest

from texthero.lang.hero_zh import preprocessing, stopwords
from ... import PandasTestCase


"""
Test doctest
"""


def load_tests(loader, tests, ignore):
    """Unittest discovery hook: run the preprocessing doctests as tests."""
    doctest_suite = doctest.DocTestSuite(preprocessing)
    tests.addTests(doctest_suite)
    return tests


class TestPreprocessing(PandasTestCase):
    """Unit tests for the Chinese (zh) preprocessing functions."""

    def test_remove_whitespace(self):
        """Newlines and tabs are collapsed into single spaces."""
        series = pd.Series("早上好啊,\n\t我的朋友。今天我要去吃 KFC。")
        expected = pd.Series("早上好啊, 我的朋友。今天我要去吃 KFC。")
        self.assertEqual(preprocessing.remove_whitespace(series), expected)

    def test_pipeline_stopwords(self):
        """A custom clean() pipeline applies its steps in order."""
        series = pd.Series("语言是人类区别其他动物的本质特性。\t@中国NLP第一大师\n#如何定义NLP 为什么呢?")
        expected = pd.Series("语言是人类区别其他动物的本质特性。 为什么呢?")
        pipeline = [
            preprocessing.remove_whitespace,
            preprocessing.remove_hashtags,
            preprocessing.remove_tags,
        ]
        self.assertEqual(preprocessing.clean(series, pipeline=pipeline), expected)

    def test_remove_html_tags(self):
        """HTML tags and entities are stripped from the text."""
        series = pd.Series("<html> 中国新闻网 <br>体育</br> 标记<html> &nbsp;")
        expected = pd.Series(" 中国新闻网 体育 标记 ")
        self.assertEqual(preprocessing.remove_html_tags(series), expected)

    def test_tokenize(self):
        """A single sentence is segmented into a list of tokens."""
        series = pd.Series("我昨天吃烤鸭去了。")
        expected = pd.Series([["我", "昨天", "吃", "烤鸭", "去", "了", "。"]])
        self.assertEqual(preprocessing.tokenize(series), expected)

    def test_tokenize_multirows(self):
        """Each row of a multi-row Series is tokenized independently."""
        series = pd.Series(["今天天气真好", "明天会怎样呢"])
        expected = pd.Series([["今天天气", "真", "好"], ["明天", "会", "怎样", "呢"]])
        self.assertEqual(preprocessing.tokenize(series), expected)

    def test_has_content(self):
        """NaN, None, empty and whitespace-only cells count as no content."""
        series = pd.Series(["哈哈", np.nan, "\t\n", " ", "", "这有点东西", None])
        expected = pd.Series([True, False, False, False, False, True, False])
        self.assertEqual(preprocessing.has_content(series), expected)

    def test_remove_urls(self):
        """http:// URLs are replaced by a space."""
        series = pd.Series("http://tests.com http://www.tests.com")
        expected = pd.Series("   ")
        self.assertEqual(preprocessing.remove_urls(series), expected)

    def test_remove_urls_https(self):
        """https:// URLs are replaced by a space."""
        series = pd.Series("https://tests.com https://www.tests.com")
        expected = pd.Series("   ")
        self.assertEqual(preprocessing.remove_urls(series), expected)

    def test_remove_urls_multiline(self):
        """URL removal preserves surrounding newlines."""
        series = pd.Series("https://tests.com \n https://tests.com")
        expected = pd.Series("  \n  ")
        self.assertEqual(preprocessing.remove_urls(series), expected)

    def test_replace_tags(self):
        """@-mentions are replaced with the given symbol."""
        series = pd.Series("你好@马丁123abc佩奇,我要把你取关了。")
        expected = pd.Series("你好TAG,我要把你取关了。")
        self.assertEqual(preprocessing.replace_tags(series, symbol="TAG"), expected)

    def test_remove_tags(self):
        """@-mentions are replaced by a space."""
        series = pd.Series("你好@马丁123abc佩奇,我要把你取关了。")
        expected = pd.Series("你好 ,我要把你取关了。")
        self.assertEqual(preprocessing.remove_tags(series), expected)

    def test_replace_hashtags(self):
        """#hashtags are replaced with the given symbol."""
        series = pd.Series("语言是人类区别其他动物的本质特性。#NLP百科大全")
        expected = pd.Series("语言是人类区别其他动物的本质特性。HASHTAG")
        self.assertEqual(preprocessing.replace_hashtags(series, symbol="HASHTAG"), expected)

    def test_remove_hashtags(self):
        """#hashtags are replaced by a space."""
        series = pd.Series("语言是人类区别其他动物的本质特性。#NLP百科大全")
        expected = pd.Series("语言是人类区别其他动物的本质特性。 ")
        self.assertEqual(preprocessing.remove_hashtags(series), expected)
16 changes: 16 additions & 0 deletions texthero/_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import functools
import warnings
import wrapt


"""
Expand Down Expand Up @@ -70,3 +71,18 @@ def wrapper(*args, **kwargs):
return wrapper

return decorator


def root_caller(target_module):
    """
    Decorator factory that forwards calls to the same-named function in
    `target_module`.

    Useful for multilingual support: a language-specific module can declare
    a function whose implementation is simply the shared one in
    `texthero.target_module`. The decorated function's own body is never
    executed; only its ``__name__`` is used for the lookup.

    Parameters
    ----------
    target_module : module
        Module providing the root implementation.

    Raises
    ------
    AttributeError
        At call time, if `target_module` has no attribute with the wrapped
        function's name.
    """
    # Implemented with the stdlib functools (already used by the decorator
    # above) instead of the third-party `wrapt` package, so no extra
    # dependency is needed.

    def decorator(wrapped):
        # functools.wraps preserves the language-specific function's name
        # and docstring so help() and the docs show the local definition.
        @functools.wraps(wrapped)
        def wrapper(*args, **kwargs):
            root_func = getattr(target_module, wrapped.__name__)
            return root_func(*args, **kwargs)

        return wrapper

    return decorator
Empty file added texthero/lang/__init__.py
Empty file.
18 changes: 18 additions & 0 deletions texthero/lang/hero_zh/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Texthero: python toolkit for text preprocessing, representation and visualization.



"""
from . import preprocessing
from .preprocessing import *

# from . import representation
# from .representation import *
#
# from . import visualization
# from .visualization import *
#
# from . import nlp
# from .nlp import *

from . import stopwords
Loading