Skip to content

Commit

Permalink
Use custom pandas Series in preprocessing.py
Browse files Browse the repository at this point in the history
  • Loading branch information
ryangawei committed Aug 15, 2020
1 parent a9942bc commit 99d94d9
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 47 deletions.
2 changes: 0 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@ install_requires =
unidecode>=1.1.1
gensim>=3.6.0
matplotlib>=3.1.0
wrapt>=1.12.1
jieba>=0.42.1
# TODO pick the correct version.
[options.extras_require]
dev =
Expand Down
86 changes: 41 additions & 45 deletions texthero/lang/zh/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@
import pandas as pd

from spacy.lang.zh import Chinese

import texthero as hero
from texthero._types import TokenSeries, TextSeries, InputSeries

# Standard functions supported Chinese
# Standard functions that support Chinese
from texthero.preprocessing import (
fillna,
has_content,
Expand Down Expand Up @@ -55,12 +57,14 @@

def get_default_pipeline() -> List[Callable[[pd.Series], pd.Series]]:
"""
Return a list containing all the methods used in the default cleaning pipeline.
Return a list with the following functions:
1. :meth:`texthero.preprocessing.fillna`
2. :meth:`texthero.preprocessing.remove_whitespace`
3. :meth:`texthero.preprocessing.tokenize`
See also
--------
:meth:`texthero.preprocessing.get_default_pipeline`
"""
return [
fillna,
Expand All @@ -70,18 +74,17 @@ def get_default_pipeline() -> List[Callable[[pd.Series], pd.Series]]:
]


def clean(s: pd.Series, pipeline=None) -> pd.Series:
@InputSeries(TextSeries)
def clean(s: TextSeries, pipeline=None) -> TextSeries:
"""
Pre-process a text-based Pandas Series using the following default pipeline.
Default pipeline:
1. :meth:`texthero.preprocessing.fillna`
2. :meth:`texthero.preprocessing.remove_whitespace`
3. :meth:`texthero.preprocessing.tokenize`
Parameters
----------
s : Pandas Series
s : :class:`texthero._types.TextSeries`
pipeline : List[Callable[[Pandas Series], Pandas Series]]
Specific pipeline to use for cleaning the text; when omitted, the default pipeline is used.
Expand All @@ -96,22 +99,23 @@ def clean(s: pd.Series, pipeline=None) -> pd.Series:
>>> hero.clean(s)
0 [我, 昨天, 吃, 烤鸭, 去, 了, 。, 挺好吃, 的, 。]
dtype: object
See also
--------
:meth:`texthero.preprocessing.clean`
"""
if not pipeline:
pipeline = get_default_pipeline()

return hero.preprocessing.clean(s, pipeline)


def replace_tags(s: pd.Series, symbol: str) -> pd.Series:
"""Replace all tags from a given Pandas Series with symbol.
A tag is a string formed by @ concatenated with a sequence of Chinese & English characters and digits.
Example: @我爱texthero123.
@InputSeries(TextSeries)
def replace_tags(s: TextSeries, symbol: str) -> TextSeries:
"""
Parameters
----------
s : Pandas Series
s : :class:`texthero._types.TextSeries`
symbol : str
Symbol that replaces each tag
Expand All @@ -125,18 +129,18 @@ def replace_tags(s: pd.Series, symbol: str) -> pd.Series:
0 你好啊TAG。
dtype: object
See also
--------
:meth:`texthero.preprocessing.replace_tags`
"""

pattern = r"@[a-zA-Z0-9\u4e00-\u9fa5]+"
return s.str.replace(pattern, symbol)


def remove_tags(s: pd.Series) -> pd.Series:
"""Remove all tags from a given Pandas Series.
A tag is a string formed by @ concatenated with a sequence of Chinese & English characters and digits.
Example: @我爱texthero123. Tags are replaced by a single space ` `.
@InputSeries(TextSeries)
def remove_tags(s: TextSeries) -> TextSeries:
"""
Examples
--------
>>> import texthero.lang.zh as hero
Expand All @@ -148,20 +152,17 @@ def remove_tags(s: pd.Series) -> pd.Series:
See also
--------
:meth:`texthero.preprocessing.replace_tags` for replacing a tag with a custom symbol.
:meth:`texthero.preprocessing.remove_tags`
"""
return replace_tags(s, " ")


def replace_hashtags(s: pd.Series, symbol: str) -> pd.Series:
"""Replace all hashtags from a Pandas Series with symbol
A hashtag is a string formed by # concatenated with a sequence of Chinese & English characters, digits and underscores.
Example: #杰克_texthero_123.
@InputSeries(TextSeries)
def replace_hashtags(s: TextSeries, symbol: str) -> TextSeries:
"""
Parameters
----------
s : Pandas Series
s : :class:`texthero._types.TextSeries`
symbol : str
Symbol that replaces each hashtag
Expand All @@ -175,17 +176,17 @@ def replace_hashtags(s: pd.Series, symbol: str) -> pd.Series:
0 今天天气真不错HASHTAG。
dtype: object
See also
--------
:meth:`texthero.preprocessing.replace_hashtags`
"""
pattern = r"#[a-zA-Z0-9_\u4e00-\u9fa5]+"
return s.str.replace(pattern, symbol)


def remove_hashtags(s: pd.Series) -> pd.Series:
"""Remove all hashtags from a given Pandas Series
A hashtag is a string formed by # concatenated with a sequence of Chinese & English characters, digits and underscores.
Example: #杰克_texthero_123.
@InputSeries(TextSeries)
def remove_hashtags(s: TextSeries) -> TextSeries:
"""
Examples
--------
>>> import texthero.lang.zh as hero
Expand All @@ -197,21 +198,14 @@ def remove_hashtags(s: pd.Series) -> pd.Series:
See also
--------
:meth:`texthero.preprocessing.replace_hashtags` for replacing a hashtag with a custom symbol.
:meth:`texthero.preprocessing.remove_hashtags`
"""
return replace_hashtags(s, " ")


def tokenize(s: pd.Series) -> pd.Series:
@InputSeries(TextSeries)
def tokenize(s: TextSeries) -> TokenSeries:
"""
Tokenize each row of the given Series.
Tokenize each row of the given Pandas Series and return a Pandas Series where each row contains a list of tokens.
Algorithm: add a space around every punctuation symbol,
except when the symbol is between two alphanumeric characters, and then split.
Examples
--------
>>> import texthero.lang.zh as hero
Expand All @@ -221,7 +215,9 @@ def tokenize(s: pd.Series) -> pd.Series:
0 [我, 昨天, 吃, 烤鸭, 去, 了, 。]
dtype: object
See also
--------
:meth:`texthero.preprocessing.tokenize`
"""

tokenizer = Chinese()
return s.apply(lambda string: [token.text for token in tokenizer(string)])

0 comments on commit 99d94d9

Please sign in to comment.