Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add initial Chinese support for hero.lang.zh.preprocessing #128

Draft
wants to merge 14 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ install_requires =
unidecode>=1.1.1
gensim>=3.6.0
matplotlib>=3.1.0
wrapt>=1.12.1
jieba>=0.42.1
# TODO: confirm the minimum required versions for wrapt and jieba.
[options.extras_require]
dev =
Expand Down
Empty file added tests/lang/__init__.py
Empty file.
Empty file added tests/lang/zh/__init__.py
Empty file.
125 changes: 125 additions & 0 deletions tests/lang/zh/test_indexes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import pandas as pd
from texthero.lang.hero_zh import preprocessing

from ... import PandasTestCase
import unittest
import string
from parameterized import parameterized


# Define valid inputs for different functions.
# NOTE(review): the non-default index ([5], [5, 6]) lets the tests below
# detect functions that accidentally reset or drop the input's index.
s_text = pd.Series(["Test"], index=[5])
s_numeric = pd.Series([5.0], index=[5])
s_numeric_lists = pd.Series([[5.0, 5.0], [6.0, 6.0]], index=[5, 6])

# Define all test cases. Every test case is a list
# of [name of test case, function to test, tuple of valid input for the function].
# First argument of valid input has to be the Pandas Series where we
# want to keep the index. If this is different for a function, a separate
# test case has to be implemented in the class below.
# The tests will be run by AbstractIndexTest below through the @parameterized
# decorator.
# The names will be expanded automatically, so e.g. "named_entities"
# creates test cases test_correct_index_named_entities and
# test_incorrect_index_named_entities.


test_cases_preprocessing = [
    ["fillna", preprocessing.fillna, (s_text,)],
    ["remove_whitespace", preprocessing.remove_whitespace, (s_text,)],
    ["clean", preprocessing.clean, (s_text,)],
    ["remove_html_tags", preprocessing.remove_html_tags, (s_text,)],
    ["tokenize", preprocessing.tokenize, (s_text,)],
    ["replace_urls", preprocessing.replace_urls, (s_text, "")],
    ["remove_urls", preprocessing.remove_urls, (s_text,)],
    ["replace_tags", preprocessing.replace_tags, (s_text, "")],
    ["remove_tags", preprocessing.remove_tags, (s_text,)],
    ["replace_hashtags", preprocessing.replace_hashtags, (s_text, "")],
    ["remove_hashtags", preprocessing.remove_hashtags, (s_text,)],
]

# Only the preprocessing cases are implemented for Chinese so far; the
# nlp/representation/visualization cases remain commented out below.
test_cases = test_cases_preprocessing

# test_cases_nlp = [
# ["named_entities", nlp.named_entities, (s_text,)],
# ["noun_chunks", nlp.noun_chunks, (s_text,)],
# ]
#
# test_cases_preprocessing = [
# ["fillna", preprocessing.fillna, (s_text,)],
# ["lowercase", preprocessing.lowercase, (s_text,)],
# ["replace_digits", preprocessing.replace_digits, (s_text, "")],
# ["remove_digits", preprocessing.remove_digits, (s_text,)],
# ["replace_punctuation", preprocessing.replace_punctuation, (s_text, "")],
# ["remove_punctuation", preprocessing.remove_punctuation, (s_text,)],
# ["remove_diacritics", preprocessing.remove_diacritics, (s_text,)],
# ["remove_whitespace", preprocessing.remove_whitespace, (s_text,)],
# ["replace_stopwords", preprocessing.replace_stopwords, (s_text, "")],
# ["remove_stopwords", preprocessing.remove_stopwords, (s_text,)],
# ["stem", preprocessing.stem, (s_text,)],
# ["clean", preprocessing.clean, (s_text,)],
# ["remove_round_brackets", preprocessing.remove_round_brackets, (s_text,)],
# ["remove_curly_brackets", preprocessing.remove_curly_brackets, (s_text,)],
# ["remove_square_brackets", preprocessing.remove_square_brackets, (s_text,)],
# ["remove_angle_brackets", preprocessing.remove_angle_brackets, (s_text,)],
# ["remove_brackets", preprocessing.remove_brackets, (s_text,)],
# ["remove_html_tags", preprocessing.remove_html_tags, (s_text,)],
# ["tokenize", preprocessing.tokenize, (s_text,)],
# ["tokenize_with_phrases", preprocessing.tokenize_with_phrases, (s_text,)],
# ["replace_urls", preprocessing.replace_urls, (s_text, "")],
# ["remove_urls", preprocessing.remove_urls, (s_text,)],
# ["replace_tags", preprocessing.replace_tags, (s_text, "")],
# ["remove_tags", preprocessing.remove_tags, (s_text,)],
# ]
#
# test_cases_representation = [
# ["count", representation.count, (preprocessing.tokenize(s_text),),],
# [
# "term_frequency",
# representation.term_frequency,
# (preprocessing.tokenize(s_text),),
# ],
# ["tfidf", representation.tfidf, (preprocessing.tokenize(s_text),)],
# ["pca", representation.pca, (s_numeric_lists, 0)],
# ["nmf", representation.nmf, (s_numeric_lists,)],
# ["tsne", representation.tsne, (s_numeric_lists,)],
# ["kmeans", representation.kmeans, (s_numeric_lists, 1)],
# ["dbscan", representation.dbscan, (s_numeric_lists,)],
# ["meanshift", representation.meanshift, (s_numeric_lists,)],
# ]
#
# test_cases_visualization = []
#
# test_cases = (
# test_cases_nlp
# + test_cases_preprocessing
# + test_cases_representation
# + test_cases_visualization
# )


class AbstractIndexTest(PandasTestCase):
    """
    Index-preservation tests driven by ``test_cases``.

    For every (name, function, valid_input) triple defined above, this
    class verifies that the function keeps the index of its first Series
    argument intact. Functions that take different inputs need a
    hand-written test instead of an entry in ``test_cases``.
    """

    @parameterized.expand(test_cases)
    def test_correct_index(self, name, test_function, valid_input):
        """The output index must equal the input index."""
        series = valid_input[0]
        expected = pd.Series(series.values, series.index)
        result = test_function(*valid_input)
        self.assertTrue(result.index.equals(expected.index))

    @parameterized.expand(test_cases)
    def test_incorrect_index(self, name, test_function, valid_input):
        """The output index must differ from a default (RangeIndex) one."""
        series = valid_input[0]
        default_indexed = pd.Series(series.values, index=None)
        result = test_function(*valid_input)
        self.assertFalse(result.index.equals(default_indexed.index))
126 changes: 126 additions & 0 deletions tests/lang/zh/test_preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import string

import pandas as pd
import numpy as np
import doctest

from texthero.lang.hero_zh import preprocessing, stopwords
from ... import PandasTestCase


"""
Test doctest
"""


def load_tests(loader, tests, ignore):
    """Unittest discovery hook: run the preprocessing doctests as tests."""
    doctest_suite = doctest.DocTestSuite(preprocessing)
    tests.addTests(doctest_suite)
    return tests


class TestPreprocessing(PandasTestCase):
    """Unit tests for the Chinese (zh) preprocessing functions."""

    def test_remove_whitespace(self):
        """Newlines and tabs are collapsed into single spaces."""
        series = pd.Series("早上好啊,\n\t我的朋友。今天我要去吃 KFC。")
        expected = pd.Series("早上好啊, 我的朋友。今天我要去吃 KFC。")
        self.assertEqual(preprocessing.remove_whitespace(series), expected)

    def test_pipeline_stopwords(self):
        """A custom clean() pipeline applies its steps in order."""
        series = pd.Series("语言是人类区别其他动物的本质特性。\t@中国NLP第一大师\n#如何定义NLP 为什么呢?")
        expected = pd.Series("语言是人类区别其他动物的本质特性。 为什么呢?")
        pipeline = [
            preprocessing.remove_whitespace,
            preprocessing.remove_hashtags,
            preprocessing.remove_tags,
        ]
        self.assertEqual(preprocessing.clean(series, pipeline=pipeline), expected)

    def test_remove_html_tags(self):
        """HTML tags and entities are stripped from the text."""
        series = pd.Series("<html> 中国新闻网 <br>体育</br> 标记<html> &nbsp;")
        expected = pd.Series(" 中国新闻网 体育 标记 ")
        self.assertEqual(preprocessing.remove_html_tags(series), expected)

    def test_tokenize(self):
        """A single sentence is segmented into a list of tokens."""
        series = pd.Series("我昨天吃烤鸭去了。")
        expected = pd.Series([["我", "昨天", "吃", "烤鸭", "去", "了", "。"]])
        self.assertEqual(preprocessing.tokenize(series), expected)

    def test_tokenize_multirows(self):
        """Each row of a multi-row Series is tokenized independently."""
        series = pd.Series(["今天天气真好", "明天会怎样呢"])
        expected = pd.Series([["今天天气", "真", "好"], ["明天", "会", "怎样", "呢"]])
        self.assertEqual(preprocessing.tokenize(series), expected)

    def test_has_content(self):
        """NaN, None, empty and whitespace-only cells count as no content."""
        series = pd.Series(["哈哈", np.nan, "\t\n", " ", "", "这有点东西", None])
        expected = pd.Series([True, False, False, False, False, True, False])
        self.assertEqual(preprocessing.has_content(series), expected)

    def test_remove_urls(self):
        """http:// URLs are replaced by a space."""
        series = pd.Series("http://tests.com http://www.tests.com")
        expected = pd.Series("   ")
        self.assertEqual(preprocessing.remove_urls(series), expected)

    def test_remove_urls_https(self):
        """https:// URLs are replaced by a space."""
        series = pd.Series("https://tests.com https://www.tests.com")
        expected = pd.Series("   ")
        self.assertEqual(preprocessing.remove_urls(series), expected)

    def test_remove_urls_multiline(self):
        """URL removal preserves surrounding newlines."""
        series = pd.Series("https://tests.com \n https://tests.com")
        expected = pd.Series("  \n  ")
        self.assertEqual(preprocessing.remove_urls(series), expected)

    def test_replace_tags(self):
        """@-mentions are replaced with the given symbol."""
        series = pd.Series("你好@马丁123abc佩奇,我要把你取关了。")
        expected = pd.Series("你好TAG,我要把你取关了。")
        self.assertEqual(preprocessing.replace_tags(series, symbol="TAG"), expected)

    def test_remove_tags(self):
        """@-mentions are replaced by a space."""
        series = pd.Series("你好@马丁123abc佩奇,我要把你取关了。")
        expected = pd.Series("你好 ,我要把你取关了。")
        self.assertEqual(preprocessing.remove_tags(series), expected)

    def test_replace_hashtags(self):
        """#hashtags are replaced with the given symbol."""
        series = pd.Series("语言是人类区别其他动物的本质特性。#NLP百科大全")
        expected = pd.Series("语言是人类区别其他动物的本质特性。HASHTAG")
        self.assertEqual(preprocessing.replace_hashtags(series, symbol="HASHTAG"), expected)

    def test_remove_hashtags(self):
        """#hashtags are replaced by a space."""
        series = pd.Series("语言是人类区别其他动物的本质特性。#NLP百科大全")
        expected = pd.Series("语言是人类区别其他动物的本质特性。 ")
        self.assertEqual(preprocessing.remove_hashtags(series), expected)
16 changes: 16 additions & 0 deletions texthero/_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import functools
import warnings
import wrapt


"""
Expand Down Expand Up @@ -70,3 +71,18 @@ def wrapper(*args, **kwargs):
return wrapper

return decorator


def root_caller(target_module):
    """
    Decorator factory that forwards calls to the same-named function in
    `target_module`.

    Useful for multilingual support: a language-specific module can declare
    a function whose implementation is simply the shared one in
    `texthero.target_module`. The decorated function's own body is never
    executed; only its ``__name__`` is used for the lookup.

    Parameters
    ----------
    target_module : module
        Module providing the root implementation.

    Raises
    ------
    AttributeError
        At call time, if `target_module` has no attribute with the wrapped
        function's name.
    """
    # Implemented with the stdlib functools (already used by the decorator
    # above) instead of the third-party `wrapt` package, so no extra
    # dependency is needed.

    def decorator(wrapped):
        # functools.wraps preserves the language-specific function's name
        # and docstring so help() and the docs show the local definition.
        @functools.wraps(wrapped)
        def wrapper(*args, **kwargs):
            root_func = getattr(target_module, wrapped.__name__)
            return root_func(*args, **kwargs)

        return wrapper

    return decorator
Empty file added texthero/lang/__init__.py
Empty file.
18 changes: 18 additions & 0 deletions texthero/lang/hero_zh/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Texthero: python toolkit for text preprocessing, representation and visualization.



"""
from . import preprocessing
from .preprocessing import *

# from . import representation
# from .representation import *
#
# from . import visualization
# from .visualization import *
#
# from . import nlp
# from .nlp import *

from . import stopwords
Loading