Skip to content

Commit

Permalink
Isolate generic preprocessing functions (piskvorky#3180)
Browse files Browse the repository at this point in the history
* Move preprocessing functions from textcorpus module

* Move preprocessing functions from lowcorpus module

* Add test cases for preprocessing functions

* Fix styling issues

* Refactor remove_stopwords() and strip_short()

* make tests pass

* rm unused import

Co-authored-by: Michael Penkov <m@penkov.dev>
  • Loading branch information
2 people authored and tbbharaj committed Aug 19, 2021
1 parent fbbdb05 commit 6f59f5b
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 115 deletions.
21 changes: 2 additions & 19 deletions gensim/corpora/lowcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,28 +11,11 @@

from gensim import utils
from gensim.corpora import IndexedCorpus

from gensim.parsing.preprocessing import split_on_space

logger = logging.getLogger(__name__)


def split_on_space(s):
    """Tokenize a line on single space characters.

    Serves as the default `line2words` callable of
    :class:`gensim.corpora.lowcorpus.LowCorpus`.

    Parameters
    ----------
    s : str
        Line to split.

    Returns
    -------
    list of str
        Non-empty tokens of `s`, in their original order.

    """
    # Trim surrounding whitespace first, then split on the literal space character only.
    pieces = utils.to_unicode(s).strip().split(' ')
    # Runs of consecutive spaces produce empty strings; filter(None, ...) discards them.
    return list(filter(None, pieces))


class LowCorpus(IndexedCorpus):
"""Corpus handles input in `GibbsLda++ format <http://gibbslda.sourceforge.net/>`_.
Expand Down Expand Up @@ -86,7 +69,7 @@ def __init__(self, fname, id2word=None, line2words=split_on_space):
If not provided, the mapping is constructed directly from `fname`.
line2words : callable, optional
Function which converts lines(str) into tokens(list of str),
using :func:`~gensim.corpora.lowcorpus.split_on_space` as default.
using :func:`~gensim.parsing.preprocessing.split_on_space` as default.
"""
IndexedCorpus.__init__(self, fname)
Expand Down
104 changes: 13 additions & 91 deletions gensim/corpora/textcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,93 +44,15 @@

from gensim import interfaces, utils
from gensim.corpora.dictionary import Dictionary
from gensim.parsing.preprocessing import STOPWORDS, RE_WHITESPACE
from gensim.parsing.preprocessing import (
remove_stopword_tokens, remove_short_tokens,
lower_to_unicode, strip_multiple_whitespaces,
)
from gensim.utils import deaccent, simple_tokenize

logger = logging.getLogger(__name__)


def remove_stopwords(tokens, stopwords=STOPWORDS):
    """Filter out every token that appears in `stopwords`.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    stopwords : iterable of str, optional
        Words to discard; defaults to `gensim.parsing.preprocessing.STOPWORDS`.

    Returns
    -------
    list of str
        Tokens from `tokens` that are not in `stopwords`, order preserved.

    """
    kept = []
    for word in tokens:
        if word in stopwords:
            continue
        kept.append(word)
    return kept


def remove_short(tokens, minsize=3):
    """Drop every token whose length is below `minsize`.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    minsize : int, optional
        Minimal token length to keep (inclusive).

    Returns
    -------
    list of str
        Tokens of at least `minsize` characters, order preserved.

    """
    return list(filter(lambda tok: len(tok) >= minsize, tokens))


def lower_to_unicode(text, encoding='utf8', errors='strict'):
    """Convert `text` to unicode with :func:`gensim.utils.to_unicode`, then lowercase it.

    Decoding happens before lowercasing on purpose: calling ``lower()`` on a byte string
    only affects ASCII letters, so non-ASCII uppercase characters would stay uppercase,
    and bytes belonging to multi-byte encodings could be altered before decoding.

    Parameters
    ----------
    text : str or bytes
        Input text.
    encoding : str, optional
        Encoding that will be used for conversion.
    errors : str, optional
        Error handling behaviour, forwarded to :func:`gensim.utils.to_unicode`.

    Returns
    -------
    str
        Lowercased unicode version of `text`.

    """
    return utils.to_unicode(text, encoding, errors).lower()


def strip_multiple_whitespaces(s):
    """Replace each match of `RE_WHITESPACE` in `s` with a single space.

    Parameters
    ----------
    s : str
        Input string.

    Returns
    -------
    str
        `s` with every whitespace run collapsed into one space.

    """
    collapsed = RE_WHITESPACE.sub(" ", s)
    return collapsed


class TextCorpus(interfaces.CorpusABC):
"""Helper class to simplify the pipeline of getting BoW vectors from plain text.
Expand Down Expand Up @@ -177,12 +99,12 @@ class TextCorpus(interfaces.CorpusABC):
The default preprocessing consists of:
#. :func:`~gensim.corpora.textcorpus.lower_to_unicode` - lowercase and convert to unicode (assumes utf8 encoding)
#. :func:`~gensim.parsing.preprocessing.lower_to_unicode` - lowercase and convert to unicode (assumes utf8 encoding)
#. :func:`~gensim.utils.deaccent`- deaccent (asciifolding)
#. :func:`~gensim.corpora.textcorpus.strip_multiple_whitespaces` - collapse multiple whitespaces into a single one
#. :func:`~gensim.parsing.preprocessing.strip_multiple_whitespaces` - collapse multiple whitespaces into one
#. :func:`~gensim.utils.simple_tokenize` - tokenize by splitting on whitespace
#. :func:`~gensim.corpora.textcorpus.remove_short` - remove words less than 3 characters long
#. :func:`~gensim.corpora.textcorpus.remove_stopwords` - remove stopwords
#. :func:`~gensim.parsing.preprocessing.remove_short_tokens` - remove words less than 3 characters long
#. :func:`~gensim.parsing.preprocessing.remove_stopword_tokens` - remove stopwords
"""

Expand All @@ -204,15 +126,15 @@ def __init__(self, input=None, dictionary=None, metadata=False, character_filter
Each will be applied to the text of each document in order, and should return a single string with
the modified text. For Python 2, the original text will not be unicode, so it may be useful to
convert to unicode as the first character filter.
If None - using :func:`~gensim.corpora.textcorpus.lower_to_unicode`,
:func:`~gensim.utils.deaccent` and :func:`~gensim.corpora.textcorpus.strip_multiple_whitespaces`.
If None - using :func:`~gensim.parsing.preprocessing.lower_to_unicode`,
:func:`~gensim.utils.deaccent` and :func:`~gensim.parsing.preprocessing.strip_multiple_whitespaces`.
tokenizer : callable, optional
Tokenizer for document, if None - using :func:`~gensim.utils.simple_tokenize`.
token_filters : iterable of callable, optional
Each will be applied to the iterable of tokens in order, and should return another iterable of tokens.
These filters can add, remove, or replace tokens, or do nothing at all.
If None - using :func:`~gensim.corpora.textcorpus.remove_short` and
:func:`~gensim.corpora.textcorpus.remove_stopwords`.
If None - using :func:`~gensim.parsing.preprocessing.remove_short_tokens` and
:func:`~gensim.parsing.preprocessing.remove_stopword_tokens`.
Examples
--------
Expand Down Expand Up @@ -254,7 +176,7 @@ def __init__(self, input=None, dictionary=None, metadata=False, character_filter

self.token_filters = token_filters
if self.token_filters is None:
self.token_filters = [remove_short, remove_stopwords]
self.token_filters = [remove_short_tokens, remove_stopword_tokens]

self.length = None
self.dictionary = None
Expand Down
95 changes: 91 additions & 4 deletions gensim/parsing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,20 @@
RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE)


def remove_stopwords(s):
def remove_stopwords(s, stopwords=None):
"""Remove :const:`~gensim.parsing.preprocessing.STOPWORDS` from `s`.
Parameters
----------
s : str
stopwords : iterable of str, optional
Sequence of stopwords
If None - using :const:`~gensim.parsing.preprocessing.STOPWORDS`
Returns
-------
str
Unicode string without :const:`~gensim.parsing.preprocessing.STOPWORDS`.
Unicode string without `stopwords`.
Examples
--------
Expand All @@ -90,7 +93,29 @@ def remove_stopwords(s):
"""
s = utils.to_unicode(s)
return " ".join(w for w in s.split() if w not in STOPWORDS)
return " ".join(remove_stopword_tokens(s.split(), stopwords))


def remove_stopword_tokens(tokens, stopwords=None):
    """Filter out every token that appears in `stopwords`.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    stopwords : iterable of str, optional
        Words to discard.
        If None - using :const:`~gensim.parsing.preprocessing.STOPWORDS`.

    Returns
    -------
    list of str
        Tokens from `tokens` that are not in `stopwords`, order preserved.

    """
    # Resolve the default at call time so a redefined module-level STOPWORDS is honoured.
    active_stopwords = STOPWORDS if stopwords is None else stopwords
    return [word for word in tokens if not (word in active_stopwords)]


def strip_punctuation(s):
Expand Down Expand Up @@ -170,7 +195,26 @@ def strip_short(s, minsize=3):
"""
s = utils.to_unicode(s)
return " ".join(e for e in s.split() if len(e) >= minsize)
return " ".join(remove_short_tokens(s.split(), minsize))


def remove_short_tokens(tokens, minsize=3):
    """Drop every token whose length is below `minsize`.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    minsize : int, optional
        Minimal token length to keep (inclusive).

    Returns
    -------
    list of str
        Tokens of at least `minsize` characters, order preserved.

    """
    long_enough = []
    for tok in tokens:
        if len(tok) >= minsize:
            long_enough.append(tok)
    return long_enough


def strip_numeric(s):
Expand Down Expand Up @@ -308,6 +352,49 @@ def stem_text(text):
stem = stem_text


def lower_to_unicode(text, encoding='utf8', errors='strict'):
    """Convert `text` to unicode with :func:`gensim.utils.to_unicode`, then lowercase it.

    Decoding happens before lowercasing on purpose: calling ``lower()`` on a byte string
    only affects ASCII letters, so non-ASCII uppercase characters would stay uppercase,
    and bytes belonging to multi-byte encodings could be altered before decoding.

    Parameters
    ----------
    text : str or bytes
        Input text.
    encoding : str, optional
        Encoding that will be used for conversion.
    errors : str, optional
        Error handling behaviour, forwarded to :func:`gensim.utils.to_unicode`.

    Returns
    -------
    str
        Lowercased unicode version of `text`.

    """
    return utils.to_unicode(text, encoding, errors).lower()


def split_on_space(s):
    """Tokenize a line on single space characters.

    Used in :class:`gensim.corpora.lowcorpus.LowCorpus`.

    Parameters
    ----------
    s : str
        Line to split.

    Returns
    -------
    list of str
        Non-empty tokens of `s`, in their original order.

    """
    # Trim surrounding whitespace first, then split on the literal space character only.
    pieces = utils.to_unicode(s).strip().split(' ')
    # Runs of consecutive spaces produce empty strings; filter(None, ...) discards them.
    return list(filter(None, pieces))


DEFAULT_FILTERS = [
lambda x: x.lower(), strip_tags, strip_punctuation,
strip_multiple_whitespaces, strip_numeric,
Expand Down
26 changes: 25 additions & 1 deletion gensim/test/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,17 @@

import logging
import unittest

import mock
import numpy as np

from gensim.parsing.preprocessing import (
remove_short_tokens,
remove_stopword_tokens,
remove_stopwords,
stem_text,
split_alphanum,
split_on_space,
strip_multiple_whitespaces,
strip_non_alphanum,
strip_numeric,
Expand All @@ -21,7 +26,6 @@
strip_tags,
)


# several documents
doc1 = """C'est un trou de verdure où chante une rivière,
Accrochant follement aux herbes des haillons
Expand Down Expand Up @@ -76,6 +80,26 @@ def test_split_alphanum(self):
def test_strip_stopwords(self):
    """Check `remove_stopwords` with the default and with a patched global `STOPWORDS`."""
    sentence = "the world is square"
    self.assertEqual(remove_stopwords(sentence), "world square")

    # confirm that redefining the global `STOPWORDS` takes effect
    patched_stopwords = frozenset(["the"])
    with mock.patch('gensim.parsing.preprocessing.STOPWORDS', patched_stopwords):
        self.assertEqual(remove_stopwords(sentence), "world is square")

def test_strip_stopword_tokens(self):
    """Check `remove_stopword_tokens` with the default and with a patched global `STOPWORDS`."""
    tokens = ["the", "world", "is", "sphere"]
    self.assertEqual(remove_stopword_tokens(tokens), ["world", "sphere"])

    # confirm that redefining the global `STOPWORDS` takes effect
    patched_stopwords = frozenset(["the"])
    with mock.patch('gensim.parsing.preprocessing.STOPWORDS', patched_stopwords):
        expected = ["world", "is", "sphere"]
        self.assertEqual(remove_stopword_tokens(tokens), expected)

def test_strip_short_tokens(self):
    """Check that `remove_short_tokens` drops tokens shorter than the given minimum size."""
    tokens = ["salut", "les", "amis", "du", "59"]
    expected = ["salut", "les", "amis"]
    self.assertEqual(remove_short_tokens(tokens, 3), expected)

def test_split_on_space(self):
    """Check that `split_on_space` ignores leading, trailing and repeated spaces."""
    line = " salut les amis du 59 "
    expected = ["salut", "les", "amis", "du", "59"]
    self.assertEqual(split_on_space(line), expected)

def test_stem_text(self):
target = \
"while it is quit us to be abl to search a larg " + \
Expand Down

0 comments on commit 6f59f5b

Please sign in to comment.