jbesomi · henrifroese · Aug 18, 2020 · Aug 19, 2020 · Aug 19, 2020 · Aug 21, 2020
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
@@ -381,3 +381,45 @@ def test_remove_hashtags(self):
         s_true = pd.Series("Hi  , we will remove you")
 
         self.assertEqual(preprocessing.remove_hashtags(s), s_true)
+
+    """
+    Filter Extremes
+    """
+
+    def test_filter_extrems(self):
+        s = pd.Series(
+            [
+                "Here one two one one one go there",
+                "two go one one one two two two is important",
+            ]
+        )
+        s_result = s.pipe(preprocessing.tokenize).pipe(preprocessing.filter_extremes, 3)
+        s_true = pd.Series(  # max_words=3 keeps the 3 most frequent: one, two, go
+            [
+                ["one", "two", "one", "one", "one", "go"],
+                ["two", "go", "one", "one", "one", "two", "two", "two"],
+            ]
+        )
+        pd.testing.assert_series_equal(s_result, s_true)
+
+    def test_filter_extrems_min_and_max(self):
+        s = pd.Series(
+            [
+                "Here one two one one one go there",
+                "two go one one one two two two is important",
+                "one two three four this is good",
+                "here one one important statement",
+            ]
+        )
+        s_result = s.pipe(preprocessing.tokenize).pipe(
+            preprocessing.filter_extremes, min_df=2, max_df=3
+        )
+        s_true = pd.Series(
+            [
+                ["two", "go"],  # "one" dropped: in all 4 documents (> max_df)
+                ["two", "go", "two", "two", "two", "is", "important"],
+                ["two", "is"],  # "three", "four", ...: in 1 document (< min_df)
+                ["important"],  # "here"/"statement": in 1 document (< min_df)
+            ]
+        )
+        pd.testing.assert_series_equal(s_result, s_true)
diff --git a/texthero/nlp.py b/texthero/nlp.py
@@ -158,7 +158,7 @@ def pos_tag(s: TextSeries) -> pd.Series:
     coarse-grained POS has a NOUN value, then the refined POS will give more
     details about the type of the noun, whether it is singular, plural and/or
     proper.
-    
+
     You can use the spacy `explain` function to find out which fine-grained
     POS it is.
 

diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py
@@ -14,6 +14,7 @@
 
 from texthero import stopwords as _stopwords
 from texthero._types import TokenSeries, TextSeries, InputSeries
+from texthero import representation
 
 from typing import List, Callable, Union
 
@@ -49,7 +50,7 @@ def lowercase(s: TextSeries) -> TextSeries:
     """
     Lowercase all texts in a series.
 
-    
+
     Examples
     --------
     >>> import texthero as hero
@@ -143,8 +144,8 @@ def replace_punctuation(s: TextSeries, symbol: str = " ") -> TextSeries:
     Replace all punctuation with a given symbol.
 
     Replace all punctuation from the given
-    Pandas Series with a custom symbol. 
-    It considers as punctuation characters all :data:`string.punctuation` 
+    Pandas Series with a custom symbol.
+    It considers as punctuation characters all :data:`string.punctuation`
     symbols `!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~).`
 
 
@@ -367,7 +368,6 @@ def remove_stopwords(
     0    Texthero      
     dtype: object
 
-
     """
     return replace_stopwords(s, symbol="", stopwords=stopwords)
 
@@ -861,7 +861,7 @@ def replace_hashtags(s: TextSeries, symbol: str) -> TextSeries:
     """Replace all hashtags from a Pandas Series with symbol
 
     A hashtag is a string formed by # concatenated with a sequence of
-    characters, digits and underscores. Example: #texthero_123. 
+    characters, digits and underscores. Example: #texthero_123.
 
     Parameters
     ----------
@@ -889,7 +889,7 @@ def remove_hashtags(s: TextSeries) -> TextSeries:
     """Remove all hashtags from a given Pandas Series
 
     A hashtag is a string formed by # concatenated with a sequence of
-    characters, digits and underscores. Example: #texthero_123. 
+    characters, digits and underscores. Example: #texthero_123.
 
     Examples
     --------
@@ -906,3 +906,78 @@ def remove_hashtags(s: TextSeries) -> TextSeries:
         with a custom symbol.
     """
     return replace_hashtags(s, " ")
+
+
+@InputSeries(TokenSeries)
+def filter_extremes(
+    s: TokenSeries, max_words=None, min_df=1, max_df=1.0
+) -> TokenSeries:
+    """
+    Filter out tokens that are too rare or too common.
+
+    Dropping such tokens is often useful to reduce noise and
+    improve performance. A token is removed from all documents
+    when its document frequency (= number of documents it
+    appears in) is below min_df or above max_df. Additionally,
+    at most max_words distinct tokens are kept.
+
+    An integer min_df or max_df is an absolute number of
+    documents; a float is a proportion of documents.
+
+    Parameters
+    ----------
+    s : TokenSeries
+        The tokenized documents to filter.
+
+    max_words : int, default to None
+        The maximum number of distinct words/tokens that
+        are kept, according to term frequency descending.
+        If None, will consider all features.
+
+    min_df : int or float, default to 1
+        Remove words that have a document frequency
+        lower than min_df. If float, it represents a
+        proportion of documents, integer absolute counts.
+
+    max_df : int or float, default to 1.0
+        Remove words that have a document frequency
+        higher than max_df. If float, it represents a
+        proportion of documents, integer absolute counts.
+
+    Returns
+    -------
+    TokenSeries
+        The documents with only the kept tokens, in original order.
+
+    Examples
+    --------
+    >>> import texthero as hero
+    >>> import pandas as pd
+    >>> s = pd.Series(
+    ...        [
+    ...         "Here one two one one one go there",
+    ...         "two go one one one two two two is important",
+    ...     ]
+    ... )
+    >>> s.pipe(hero.tokenize).pipe(hero.filter_extremes, 3)
+    0              [one, two, one, one, one, go]
+    1    [two, go, one, one, one, two, two, two]
+    dtype: object
+    """
+    # Delegate the filtering to term_frequency (max_words is
+    # passed as its max_features). This cannot be done faster,
+    # as the document-term matrix has to be built anyway to
+    # filter by min_df and max_df.
+    s_term_frequency = representation.term_frequency(
+        s, max_features=max_words, min_df=min_df, max_df=max_df
+    )
+
+    # The remaining tokens are exactly the column names of the
+    # term_frequency result; a set gives fast membership tests.
+    tokens_to_keep = set(s_term_frequency.columns)
+
+    # Per document, keep only tokens in tokens_to_keep (order preserved).
+    # FIXME: Parallelize this after #162 is merged.
+    return s.apply(
+        lambda token_list: [token for token in token_list if token in tokens_to_keep]
+    )
diff --git a/texthero/representation.py b/texthero/representation.py
@@ -78,7 +78,7 @@ def count(
 
     min_df : float in range [0.0, 1.0] or int, optional, default=1
         When building the vocabulary ignore terms that have a document
-        frequency (number of documents they appear in) strictly 
+        frequency (number of documents they appear in) strictly
         lower than the given threshold.
         If float, the parameter represents a proportion of documents,
         integer absolute counts.
@@ -154,7 +154,7 @@ def term_frequency(
 
     min_df : float in range [0.0, 1.0] or int, optional, default=1
         When building the vocabulary ignore terms that have a document
-        frequency (number of documents they appear in) strictly 
+        frequency (number of documents they appear in) strictly
         lower than the given threshold.
         If float, the parameter represents a proportion of documents,
         integer absolute counts.
@@ -233,7 +233,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram
 
     min_df : float in range [0.0, 1.0] or int, optional, default=1
         When building the vocabulary ignore terms that have a document
-        frequency (number of documents they appear in) strictly 
+        frequency (number of documents they appear in) strictly
         lower than the given threshold.
         If float, the parameter represents a proportion of documents, 
         integer absolute counts.
@@ -378,7 +378,7 @@ def nmf(
     natural language processing to find clusters of similar
     texts (e.g. some texts in a corpus might be about sports
     and some about music, so they will differ in the usage
-    of technical terms; see the example below). 
+    of technical terms; see the example below).
 
     Given a document-term matrix (so in
     texthero usually a Series after applying
@@ -424,7 +424,7 @@ def nmf(
     >>> # As we can see, the third document, which
     >>> # is a mix of sports and music, is placed
     >>> # between the two axes (the topics) while
-    >>> # the other documents are placed right on 
+    >>> # the other documents are placed right on
     >>> # one topic axis each.
 
     See also
@@ -575,11 +575,11 @@ def kmeans(
     Performs K-means clustering algorithm on the given input.
 
     K-means clustering is used in natural language processing
-    to separate texts into k clusters (groups) 
+    to separate texts into k clusters (groups)
     (e.g. some texts in a corpus might be about sports
     and some about music, so they will differ in the usage
     of technical terms; the K-means algorithm uses this
-    to separate them into two clusters). 
+    to separate them into two clusters).
 
     Given a document-term matrix (so in
     texthero usually a Series after applying