From fa342a92d4f007cebfce29f1f22a4e31fedc56c6 Mon Sep 17 00:00:00 2001
From: Maximilian Krahn
Date: Tue, 18 Aug 2020 22:06:14 +0200
Subject: [PATCH 01/21] added MultiIndex DF support

support MultiIndex DataFrame as function parameter;
returns MultiIndex DataFrame where a Representation Series was returned before

* missing: correct test

Co-authored-by: Henri Froese
---
 tests/test_indexes.py        |  18 +--
 tests/test_representation.py |  63 +-------
 texthero/representation.py   | 294 +++++++++++++----------------------
 texthero/visualization.py    |   4 +-
 4 files changed, 115 insertions(+), 264 deletions(-)

diff --git a/tests/test_indexes.py b/tests/test_indexes.py
index cc041c3a..af7afcd2 100644
--- a/tests/test_indexes.py
+++ b/tests/test_indexes.py
@@ -56,21 +56,9 @@
 ]
 
 test_cases_representation = [
-    [
-        "count",
-        lambda x: representation.flatten(representation.count(x)),
-        (s_tokenized_lists,),
-    ],
-    [
-        "term_frequency",
-        lambda x: representation.flatten(representation.term_frequency(x)),
-        (s_tokenized_lists,),
-    ],
-    [
-        "tfidf",
-        lambda x: representation.flatten(representation.tfidf(x)),
-        (s_tokenized_lists,),
-    ],
+    ["count", representation.count, (s_tokenized_lists,),],
+    ["term_frequency", representation.term_frequency, (s_tokenized_lists,),],
+    ["tfidf", representation.tfidf, (s_tokenized_lists,),],
     ["pca", representation.pca, (s_numeric_lists, 0)],
     ["nmf", representation.nmf, (s_numeric_lists,)],
     ["tsne", representation.tsne, (s_numeric_lists,)],
diff --git a/tests/test_representation.py b/tests/test_representation.py
index 036775af..41b81ffa 100644
--- a/tests/test_representation.py
+++ b/tests/test_representation.py
@@ -50,16 +50,9 @@ def _tfidf(term, corpus, document_index):
     [["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7]
 )
 
-s_tokenized_output_index = pd.MultiIndex.from_tuples(
-    [(0, "!"), (0, "TEST"), (0, "Test"), (1, "."), (1, "?"), (1, "Test")],
-)
-
-s_tokenized_output_noncontinuous_index = pd.MultiIndex.from_tuples(
-    [(5, "!"), (5, "TEST"), (5, "Test"), (7, "."), (7, "?"), (7, "Test")],
-)
-
-s_tokenized_output_min_df_index = pd.MultiIndex.from_tuples([(0, "Test"), (1, "Test")],)
+s_tokenized_output_index = [0,1]
 
+s_tokenized_output_index_noncontinous = [5,7]
 
 test_cases_vectorization = [
     # format: [function_name, function, correct output for tokenized input above, dtype of output]
@@ -182,55 +175,3 @@ def test_tfidf_formula(self):
         ).astype("Sparse")
 
         self.assertEqual(representation.tfidf(s), s_true)
-
-    """
-    flatten.
-    """
-
-    def test_flatten(self):
-        index = pd.MultiIndex.from_tuples(
-            [("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")],
-        )
-        s = pd.Series([3, np.nan, 4], index=index)
-
-        s_true = pd.Series(
-            [[3.0, 0.0, np.nan], [0.0, 4.0, 0.0]], index=["doc0", "doc1"],
-        )
-
-        pd.testing.assert_series_equal(
-            representation.flatten(s), s_true, check_names=False
-        )
-
-    def test_flatten_fill_missing_with(self):
-        index = pd.MultiIndex.from_tuples(
-            [("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")],
-        )
-        s = pd.Series([3, np.nan, 4], index=index)
-
-        s_true = pd.Series(
-            [[3.0, "FILLED", np.nan], ["FILLED", 4.0, "FILLED"]],
-            index=["doc0", "doc1"],
-        )
-
-        pd.testing.assert_series_equal(
-            representation.flatten(s, fill_missing_with="FILLED"),
-            s_true,
-            check_names=False,
-        )
-
-    def test_flatten_missing_row(self):
-        # Simulating a row with no features, so it's completely missing from
-        # the representation series.
- index = pd.MultiIndex.from_tuples( - [("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], - ) - s = pd.Series([3, np.nan, 4], index=index) - - s_true = pd.Series( - [[3.0, 0.0, np.nan], [0.0, 4.0, 0.0], [0.0, 0.0, 0.0]], - index=["doc0", "doc1", "doc2"], - ) - - pd.testing.assert_series_equal( - representation.flatten(s, index=s_true.index), s_true, check_names=False - ) diff --git a/texthero/representation.py b/texthero/representation.py index 07b7706c..042db71a 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -27,90 +27,14 @@ """ -def flatten( - s: Union[pd.Series, pd.Series.sparse], - index: pd.Index = None, - fill_missing_with: Any = 0.0, -) -> pd.Series: - """ - Transform a Pandas Representation Series to a "normal" (flattened) Pandas Series. - - The given Series should have a multiindex with first level being the document - and second level being individual features of that document (e.g. tdidf scores per word). - The flattened Series has one cell per document, with the cell being a list of all - the individual features of that document. - - Parameters - ---------- - s : Sparse Pandas Series or Pandas Series - The multiindexed Pandas Series to flatten. - - index : Pandas Index, optional, default to None - The index the flattened Series should have. - - fill_missing_with : Any, default to 0.0 - Value to fill the NaNs (missing values) with. This _does not_ mean - that existing values that are np.nan are replaced, but rather that - features that are not present in one document but present in others - are filled with fill_missing_with. See example below. - - - Examples - -------- - >>> import texthero as hero - >>> import pandas as pd - >>> import numpy as np - >>> index = pd.MultiIndex.from_tuples([("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], names=['document', 'word']) - >>> s = pd.Series([3, np.nan, 4], index=index) - >>> s - document word - doc0 Word1 3.0 - Word3 NaN - doc1 Word2 4.0 - dtype: float64 - >>> hero.flatten(s, fill_missing_with=0.0) - document - doc0 [3.0, 0.0, nan] - doc1 [0.0, 4.0, 0.0] - dtype: object - - """ - s = s.unstack(fill_value=fill_missing_with) - - if index is not None: - s = s.reindex(index, fill_value=fill_missing_with) - # Reindexing makes the documents for which no values - # are present in the Sparse Representation Series - # "reappear" correctly. - - s = pd.Series(s.values.tolist(), index=s.index) - - return s - - -def _check_is_valid_representation(s: pd.Series) -> bool: +def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: """ - Check if the given Pandas Series is a Document Representation Series. + Check if the given Pandas Series is a Document Term DF. - Returns true if Series is Document Representation Series, else False. + Returns true if input is Document Term DF, else False. """ - - # TODO: in Version 2 when only representation is accepted as input -> change "return False" to "raise ValueError" - - if not isinstance(s.index, pd.MultiIndex): - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex" - # ) - - if s.index.nlevels != 2: - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2." 
- # ) - - return True + return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) # Warning message for not-tokenized inputs @@ -132,11 +56,11 @@ def count( min_df=1, max_df=1.0, binary=False, -) -> pd.Series: +) -> pd.DataFrame: """ Represent a text-based Pandas Series using count. - Return a Document Representation Series with the + Return a Document Term DataFrame with the number of occurences of a document's words for every document. TODO add tutorial link @@ -144,10 +68,6 @@ def count( The input Series should already be tokenized. If not, it will be tokenized before count is calculated. - Use :meth:`hero.representation.flatten` on the output to get - a standard Pandas Series with the document vectors - in every cell. - Parameters ---------- s : Pandas Series (tokenized) @@ -177,15 +97,14 @@ def count( >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) >>> hero.count(s) - 0 Sentence 1 - one 1 - 1 Sentence 1 - two 1 - dtype: Sparse[int64, 0] + count + Sentence one two + 0 1 1 0 + 1 1 0 1 See Also -------- - Document Representation Series: TODO add tutorial link + Document Term DataFrame: TODO add tutorial link """ # TODO. Can be rewritten without sklearn. @@ -204,25 +123,23 @@ def count( ) tf_vectors_csr = tf.fit_transform(s) - tf_vectors_coo = coo_matrix(tf_vectors_csr) - s_out = pd.Series.sparse.from_coo(tf_vectors_coo) - - features_names = tf.get_feature_names() - - # Map word index to word name - s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]])) + multiindexed_columns = pd.MultiIndex.from_tuples( + [("count", word) for word in tf.get_feature_names()] + ) - return s_out + return pd.DataFrame.sparse.from_spmatrix( + tf_vectors_csr, s.index, multiindexed_columns + ) def term_frequency( s: pd.Series, max_features: Optional[int] = None, min_df=1, max_df=1.0, -) -> pd.Series: +) -> pd.DataFrame: """ Represent a text-based Pandas Series using term frequency. - Return a Document Representation Series with the + Return a Document Term DataFrame with the term frequencies of the terms for every document. TODO add tutorial link @@ -230,11 +147,6 @@ def term_frequency( The input Series should already be tokenized. If not, it will be tokenized before term_frequency is calculated. - Use :meth:`hero.representation.flatten` on the output to get - a standard Pandas Series with the document vectors - in every cell. - - Parameters ---------- s : Pandas Series (tokenized) @@ -261,16 +173,14 @@ def term_frequency( >>> import pandas as pd >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize) >>> hero.term_frequency(s) - 0 Sentence 0.2 - hey 0.2 - one 0.2 - 1 Sentence 0.2 - two 0.2 - dtype: Sparse[float64, nan] + term_frequency + Sentence hey one two + 0 0.2 0.2 0.2 0.0 + 1 0.2 0.0 0.0 0.2 See Also -------- - Document Representation Series: TODO add tutorial link + Document Term DataFrame: TODO add tutorial link """ # Check if input is tokenized. Else, print warning and tokenize. 
if not isinstance(s.iloc[0], list): @@ -291,17 +201,16 @@ def term_frequency( total_count_coo = np.sum(tf_vectors_coo) frequency_coo = np.divide(tf_vectors_coo, total_count_coo) - s_out = pd.Series.sparse.from_coo(frequency_coo) - - features_names = tf.get_feature_names() - - # Map word index to word name - s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]])) + multiindexed_columns = pd.MultiIndex.from_tuples( + [("term_frequency", word) for word in tf.get_feature_names()] + ) - return s_out + return pd.DataFrame.sparse.from_spmatrix( + frequency_coo, s.index, multiindexed_columns + ) -def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: +def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFrame: """ Represent a text-based Pandas Series using TF-IDF. @@ -324,20 +233,13 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: so the result is exactly what you get applying the formula described above. - Return a Document Representation Series with the + Return a Document Term DataFrame with the tfidf of every word in the document. TODO add tutorial link The input Series should already be tokenized. If not, it will be tokenized before tfidf is calculated. - If working with big pandas Series, you might want to limit - the number of features through the max_features parameter. - - Use :meth:`hero.representation.flatten` on the output to get - a standard Pandas Series with the document vectors - in every cell. - Parameters ---------- s : Pandas Series (tokenized) @@ -365,17 +267,16 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: >>> import pandas as pd >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize) >>> hero.tfidf(s) - 0 Bye 1.000000 - Hi 1.405465 - 1 Bye 2.000000 - Test 1.405465 - dtype: Sparse[float64, nan] + tfidf + Bye Hi Test + 0 1.0 1.405465 0.000000 + 1 2.0 0.000000 1.405465 See Also -------- `TF-IDF on Wikipedia `_ - Document Representation Series: TODO add tutorial link + Document Term DataFrame: TODO add tutorial link """ # Check if input is tokenized. Else, print warning and tokenize. @@ -395,16 +296,13 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: tfidf_vectors_csr = tfidf.fit_transform(s) - # Result from sklearn is in Compressed Sparse Row format. - # Pandas Sparse Series can only be initialized from Coordinate format. - tfidf_vectors_coo = coo_matrix(tfidf_vectors_csr) - s_out = pd.Series.sparse.from_coo(tfidf_vectors_coo) - - # Map word index to word name and keep original index of documents. - feature_names = tfidf.get_feature_names() - s_out.index = s_out.index.map(lambda x: (s.index[x[0]], feature_names[x[1]])) + multiindexed_columns = pd.MultiIndex.from_tuples( + [("tfidf", word) for word in tfidf.get_feature_names()] + ) - return s_out + return pd.DataFrame.sparse.from_spmatrix( + tfidf_vectors_csr, s.index, multiindexed_columns + ) """ @@ -412,7 +310,9 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: """ -def pca(s, n_components=2, random_state=None) -> pd.Series: +def pca( + s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None +) -> pd.Series: """ Perform principal component analysis on the given Pandas Series. @@ -434,7 +334,7 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: Parameters ---------- - s : Pandas Series + s : Pandas Series or MuliIndex Sparse DataFrame n_components : Int. Default is 2. 
Number of components to keep (dimensionality of output vectors). @@ -468,10 +368,18 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: """ pca = PCA(n_components=n_components, random_state=random_state, copy=False) - return pd.Series(pca.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_DocumentTermDF(s): + values = s.values + else: + values = list(s) + + return pd.Series(pca.fit_transform(values).tolist(), index=s.index) -def nmf(s, n_components=2, random_state=None) -> pd.Series: +def nmf( + s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None +) -> pd.Series: """ Performs non-negative matrix factorization. @@ -491,7 +399,7 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: Parameters ---------- - s : Pandas Series + s : Pandas Series or Pandas MultiIndex Sparse DataFrame n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -527,11 +435,17 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: """ nmf = NMF(n_components=n_components, init="random", random_state=random_state,) - return pd.Series(nmf.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_DocumentTermDF(s): + values = s.sparse.to_coo() + else: + values = list(s) + + return pd.Series(nmf.fit_transform(values).tolist(), index=s.index) def tsne( - s: pd.Series, + s: Union[pd.Series, pd.DataFrame], n_components=2, perplexity=30.0, learning_rate=200.0, @@ -557,7 +471,7 @@ def tsne( Parameters ---------- - s : Pandas Series + s : Pandas Series or Pandas MultiIndex Sparse DataFrame n_components : int, default is 2. Number of components to keep (dimensionality of output vectors). @@ -619,7 +533,13 @@ def tsne( random_state=random_state, n_jobs=n_jobs, ) - return pd.Series(tsne.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_DocumentTermDF(s): + values = s.sparse.to_coo() + else: + values = list(s) + + return pd.Series(tsne.fit_transform(values).tolist(), index=s.index) """ @@ -628,7 +548,7 @@ def tsne( def kmeans( - s: pd.Series, + s: Union[pd.Series, pd.DataFrame], n_clusters=5, n_init=10, max_iter=300, @@ -653,7 +573,7 @@ def kmeans( Parameters ---------- - s: Pandas Series + s: Pandas Series or Pandas MultiIndex Sparse DataFrame n_clusters: Int, default to 5. The number of clusters to separate the data into. 
@@ -686,7 +606,7 @@ def kmeans( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"]) - >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency).pipe(hero.flatten) # TODO: when others get Representation Support: remove flatten + >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) >>> hero.kmeans(s, n_clusters=2, random_state=42) 0 1 1 0 @@ -702,7 +622,12 @@ def kmeans( `kmeans on Wikipedia `_ """ - vectors = list(s) + + if _check_is_valid_DocumentTermDF(s): + vectors = s.sparse.to_coo() + else: + vectors = list(s) + kmeans = KMeans( n_clusters=n_clusters, n_init=n_init, @@ -715,7 +640,7 @@ def kmeans( def dbscan( - s, + s: Union[pd.Series, pd.DataFrame], eps=0.5, min_samples=5, metric="euclidean", @@ -743,7 +668,7 @@ def dbscan( Parameters ---------- - s: Pandas Series + s: Pandas Series or Pandas MultiIndex Sparse DataFrame eps : float, default=0.5 The maximum distance between two samples for one to be considered @@ -783,7 +708,7 @@ def dbscan( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, enjoy, guitar"]) - >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten) # TODO: when others get Representation Support: remove flatten + >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> hero.dbscan(s, min_samples=1, eps=4) 0 0 1 1 @@ -801,6 +726,11 @@ def dbscan( """ + if _check_is_valid_DocumentTermDF(s): + vectors = s.sparse.to_coo() + else: + vectors = list(s) + return pd.Series( DBSCAN( eps=eps, @@ -809,13 +739,13 @@ def dbscan( metric_params=metric_params, leaf_size=leaf_size, n_jobs=n_jobs, - ).fit_predict(list(s)), + ).fit_predict(vectors), index=s.index, ).astype("category") def meanshift( - s, + s: Union[pd.Series, pd.DataFrame], bandwidth=None, bin_seeding=False, min_bin_freq=1, @@ -843,7 +773,7 @@ def meanshift( Parameters ---------- - s: Pandas Series + s: Pandas Series or Pandas MultiIndex Sparse DataFrame bandwidth : float, default=None Bandwidth used in the RBF kernel. @@ -901,6 +831,11 @@ def meanshift( """ + if _check_is_valid_DocumentTermDF(s): + vectors = s.values + else: + vectors = list(s) + return pd.Series( MeanShift( bandwidth=bandwidth, @@ -909,7 +844,7 @@ def meanshift( cluster_all=cluster_all, n_jobs=n_jobs, max_iter=max_iter, - ).fit_predict(list(s)), + ).fit_predict(vectors), index=s.index, ).astype("category") @@ -962,31 +897,18 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: `Norm on Wikipedia `_ """ + isDocumentTermDF = _check_is_valid_DocumentTermDF(s) - is_valid_representation = ( - isinstance(s.index, pd.MultiIndex) and s.index.nlevels == 2 - ) - - if not is_valid_representation: - raise TypeError( - "The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex" - ) - # TODO after merging representation: use _check_is_valid_representation instead - - if pd.api.types.is_sparse(s): - s_coo_matrix = s.sparse.to_coo()[0] + if isDocumentTermDF: + s_for_vectorization = s.sparse.to_coo() else: - s = s.astype("Sparse") - s_coo_matrix = s.sparse.to_coo()[0] - - s_for_vectorization = s_coo_matrix + s_for_vectorization = list(s) result = sklearn_normalize( s_for_vectorization, norm=norm ) # Can handle sparse input. 
- result_coo = coo_matrix(result) - s_result = pd.Series.sparse.from_coo(result_coo) - s_result.index = s.index - - return s_result + if isDocumentTermDF: + return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) + else: + return pd.Series(result.tolist(), index=s.index) diff --git a/texthero/visualization.py b/texthero/visualization.py index e213285e..2426ab4d 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -63,8 +63,8 @@ def scatterplot( >>> import pandas as pd >>> df = pd.DataFrame(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"], columns=["texts"]) >>> df["texts"] = hero.clean(df["texts"]).pipe(hero.tokenize) - >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.pca, n_components=3) # TODO: when others get Representation Support: remove flatten - >>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.kmeans, n_clusters=2) # TODO: when others get Representation Support: remove flatten + >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.pca, n_components=3) + >>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.kmeans, n_clusters=2) >>> hero.scatterplot(df, col="pca", color="topics", hover_data=["texts"]) # doctest: +SKIP """ From 59a9f8c0df70d8136780b3160bc1d2ca59f48b26 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Wed, 19 Aug 2020 19:39:30 +0200 Subject: [PATCH 02/21] beginning with tests --- tests/test_representation.py | 147 +++++++++++++++++------------------ texthero/representation.py | 8 +- 2 files changed, 76 insertions(+), 79 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 41b81ffa..d4acd369 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -50,32 +50,84 @@ def _tfidf(term, corpus, document_index): [["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7] ) -s_tokenized_output_index = [0,1] +s_tokenized_output_index = [0, 1] + +s_tokenized_output_index_noncontinous = [5, 7] + + +def _get_multiindex_for_tokenized_output(first_level_name): + return pd.MultiIndex.from_product( + [[first_level_name], ["!", ".", "?", "TEST", "Test"]] + ) -s_tokenized_output_index_noncontinous = [5,7] test_cases_vectorization = [ - # format: [function_name, function, correct output for tokenized input above, dtype of output] - ["count", representation.count, [1, 1, 2, 2, 1, 1], "int"], + # format: [function_name, function, correct output for tokenized input above] + [ + "count", + representation.count, + pd.DataFrame( + [[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]], + index=s_tokenized_output_index, + columns=_get_multiindex_for_tokenized_output("count"), + ).astype("Sparse"), + ], [ "term_frequency", representation.term_frequency, - [0.125, 0.125, 0.250, 0.250, 0.125, 0.125], - "float", + pd.DataFrame( + [[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]], + index=s_tokenized_output_index, + columns=_get_multiindex_for_tokenized_output("term_frequency"), + ).astype("Sparse"), ], [ "tfidf", representation.tfidf, - [_tfidf(x[1], s_tokenized, x[0]) for x in s_tokenized_output_index], - "float", + pd.DataFrame( + [ + [ + _tfidf(x, s_tokenized, 0) # Testing the tfidf formula here + for x in ["!", ".", "?", "TEST", "Test"] + ], + [_tfidf(x, s_tokenized, 0) for x in ["!", ".", "?", "TEST", "Test"]], + ], + index=s_tokenized_output_index, + columns=_get_multiindex_for_tokenized_output("tfidf"), + ).astype("Sparse"), ], ] + test_cases_vectorization_min_df = [ - # format: [function_name, 
function, correct output for tokenized input above, dtype of output] - ["count", representation.count, [2, 1], "int"], - ["term_frequency", representation.term_frequency, [0.666667, 0.333333], "float",], - ["tfidf", representation.tfidf, [2.0, 1.0], "float",], + # format: [function_name, function, correct output for tokenized input above] + [ + "count", + representation.count, + pd.DataFrame( + [2, 1], + index=s_tokenized_output_index, + columns=pd.MultiIndex.from_tuples([("count", "Test")]), + ).astype("Sparse"), + ], + [ + "term_frequency", + representation.term_frequency, + pd.DataFrame( + [0.666667, 0.333333], + index=s_tokenized_output_index, + columns=pd.MultiIndex.from_tuples([("term_frequency", "Test")]), + ).astype("Sparse"), + ], + [ + "tfidf", + representation.tfidf, + pd.DataFrame( + [2.0, 1.0], + index=s_tokenized_output_index, + columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), + ).astype("Sparse"), + ], ] @@ -91,62 +143,23 @@ class AbstractRepresentationTest(PandasTestCase): """ @parameterized.expand(test_cases_vectorization) - def test_vectorization_simple( - self, name, test_function, correct_output_values, int_or_float - ): - if int_or_float == "int": - s_true = pd.Series( - correct_output_values, index=s_tokenized_output_index, dtype="int" - ).astype(pd.SparseDtype(np.int64, 0)) - else: - s_true = pd.Series( - correct_output_values, index=s_tokenized_output_index, dtype="float" - ).astype(pd.SparseDtype("float", np.nan)) + def test_vectorization_simple(self, name, test_function, correct_output): + s_true = correct_output result_s = test_function(s_tokenized) - - pd.testing.assert_series_equal(s_true, result_s) + pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( - self, name, test_function, correct_output_values, int_or_float + self, name, test_function, correct_output=None ): - if int_or_float == "int": - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_noncontinuous_index, - dtype="int", - ).astype(pd.SparseDtype(np.int64, 0)) - else: - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_noncontinuous_index, - dtype="float", - ).astype(pd.SparseDtype("float", np.nan)) - result_s = test_function(s_tokenized_with_noncontinuous_index) - - pd.testing.assert_series_equal(s_true, result_s) + pd.testing.assert_series_equal(s_tokenized_output_index_noncontinous, result_s) @parameterized.expand(test_cases_vectorization_min_df) - def test_vectorization_min_df( - self, name, test_function, correct_output_values, int_or_float - ): - if int_or_float == "int": - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_min_df_index, - dtype="int", - ).astype(pd.SparseDtype(np.int64, 0)) - else: - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_min_df_index, - dtype="float", - ).astype(pd.SparseDtype("float", np.nan)) - + def test_vectorization_min_df(self, name, test_function, correct_output): + s_true = correct_output result_s = test_function(s_tokenized, min_df=2) - - pd.testing.assert_series_equal(s_true, result_s) + pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): @@ -159,19 +172,3 @@ def test_vectorization_arguments_to_sklearn(self, name, test_function, *args): test_function(s_not_tokenized, max_features=1, min_df=1, 
max_df=1.0) except TypeError: self.fail("Sklearn arguments not handled correctly.") - - """ - Individual / special tests. - """ - - def test_tfidf_formula(self): - s = pd.Series(["Hi Bye", "Test Bye Bye"]) - s = preprocessing.tokenize(s) - s_true_index = pd.MultiIndex.from_tuples( - [(0, "Bye"), (0, "Hi"), (1, "Bye"), (1, "Test")], - ) - s_true = pd.Series( - [_tfidf(x[1], s, x[0]) for x in s_true_index], index=s_true_index - ).astype("Sparse") - - self.assertEqual(representation.tfidf(s), s_true) diff --git a/texthero/representation.py b/texthero/representation.py index 042db71a..efabc9c6 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -97,11 +97,11 @@ def count( >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) >>> hero.count(s) - count - Sentence one two + count + Sentence one two 0 1 1 0 1 1 0 1 - +# FIXME columns pandas doctest See Also -------- Document Term DataFrame: TODO add tutorial link @@ -375,7 +375,7 @@ def pca( values = list(s) return pd.Series(pca.fit_transform(values).tolist(), index=s.index) - +# FIXME: merge master again def nmf( s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None From 19c52de3f5ae6a1a01e4262dca00ea5177718311 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Wed, 19 Aug 2020 22:02:41 +0200 Subject: [PATCH 03/21] implemented correct sparse support *missing: test adopting for new types Co-authored-by: Henri Froese --- tests/test_representation.py | 12 ++++---- texthero/representation.py | 59 +++++++++++++++++++++--------------- 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index d4acd369..7c02ccd2 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -70,7 +70,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("count"), - ).astype("Sparse"), + ).astype("Sparse[int64, 0]"), ], [ "term_frequency", @@ -108,7 +108,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [2, 1], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("count", "Test")]), - ).astype("Sparse"), + ).astype("Sparse[int64, 0]"), ], [ "term_frequency", @@ -123,7 +123,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): "tfidf", representation.tfidf, pd.DataFrame( - [2.0, 1.0], + [2, 1], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), ).astype("Sparse"), @@ -146,20 +146,20 @@ class AbstractRepresentationTest(PandasTestCase): def test_vectorization_simple(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized) - pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) + pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( self, name, test_function, correct_output=None ): result_s = test_function(s_tokenized_with_noncontinuous_index) - pd.testing.assert_series_equal(s_tokenized_output_index_noncontinous, result_s) + pd.testing.assert_frame_equal(s_tokenized_output_index_noncontinous, result_s.index, check_dtype = False) @parameterized.expand(test_cases_vectorization_min_df) def test_vectorization_min_df(self, name, test_function, correct_output): s_true = correct_output result_s = 
test_function(s_tokenized, min_df=2) - pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) + pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): diff --git a/texthero/representation.py b/texthero/representation.py index efabc9c6..ff691212 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -101,9 +101,12 @@ def count( Sentence one two 0 1 1 0 1 1 0 1 -# FIXME columns pandas doctest + See Also -------- + + # FIXME columns pandas doctest + Document Term DataFrame: TODO add tutorial link """ # TODO. Can be rewritten without sklearn. @@ -375,8 +378,11 @@ def pca( values = list(s) return pd.Series(pca.fit_transform(values).tolist(), index=s.index) + + # FIXME: merge master again + def nmf( s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None ) -> pd.Series: @@ -437,11 +443,12 @@ def nmf( nmf = NMF(n_components=n_components, init="random", random_state=random_state,) if _check_is_valid_DocumentTermDF(s): - values = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - values = list(s) + s_for_vectorization = list(s) - return pd.Series(nmf.fit_transform(values).tolist(), index=s.index) + return pd.Series(nmf.fit_transform(s_for_vectorization).tolist(), index=s.index) def tsne( @@ -535,11 +542,12 @@ def tsne( ) if _check_is_valid_DocumentTermDF(s): - values = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - values = list(s) + s_for_vectorization = list(s) - return pd.Series(tsne.fit_transform(values).tolist(), index=s.index) + return pd.Series(tsne.fit_transform(s_for_vectorization).tolist(), index=s.index) """ @@ -624,9 +632,10 @@ def kmeans( """ if _check_is_valid_DocumentTermDF(s): - vectors = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - vectors = list(s) + s_for_vectorization = list(s) kmeans = KMeans( n_clusters=n_clusters, @@ -635,8 +644,8 @@ def kmeans( random_state=random_state, copy_x=True, algorithm=algorithm, - ).fit(vectors) - return pd.Series(kmeans.predict(vectors), index=s.index).astype("category") + ).fit(s_for_vectorization) + return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype("category") def dbscan( @@ -727,9 +736,10 @@ def dbscan( """ if _check_is_valid_DocumentTermDF(s): - vectors = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - vectors = list(s) + s_for_vectorization = list(s) return pd.Series( DBSCAN( @@ -739,7 +749,7 @@ def dbscan( metric_params=metric_params, leaf_size=leaf_size, n_jobs=n_jobs, - ).fit_predict(vectors), + ).fit_predict(s_for_vectorization), index=s.index, ).astype("category") @@ -877,17 +887,15 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: -------- >>> import texthero as hero >>> import pandas as pd - >>> idx = pd.MultiIndex.from_tuples( - ... [(0, "a"), (0, "b"), (1, "c"), (1, "d")], names=("document", "word") - ... 
) - >>> s = pd.Series([1, 2, 3, 4], index=idx) + >>> col = pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "c"), (1, "d")]) + >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], columns=col).astype("Sparse") >>> hero.normalize(s, norm="max") - document word - 0 a 0.50 - b 1.00 - 1 c 0.75 - d 1.00 - dtype: Sparse[float64, nan] + 0 1 + a b c d + 0 0.250000 0.500000 0.75 1.000000 + 1 0.571429 0.285714 1.00 0.714286 + 2 0.400000 0.400000 0.60 1.000000 + 3 0.111111 0.222222 1.00 0.888889 See Also @@ -900,7 +908,8 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: isDocumentTermDF = _check_is_valid_DocumentTermDF(s) if isDocumentTermDF: - s_for_vectorization = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: s_for_vectorization = list(s) From 41f55a8a359f15ce4ba65e1e726b9e0757fc596b Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 21 Aug 2020 10:20:02 +0200 Subject: [PATCH 04/21] added back list() and rm .tolist() --- texthero/representation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 048b42ec..025652d9 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -37,7 +37,7 @@ def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) - s = pd.Series(s.values.tolist(), index=s.index) + s = pd.Series(list(s.values), index=s.index) return s @@ -415,7 +415,7 @@ def pca( else: values = list(s) - return pd.Series(pca.fit_transform(values).tolist(), index=s.index) + return pd.Series(list(pca.fit_transform(values)), index=s.index) # FIXME: merge master again @@ -489,7 +489,7 @@ def nmf( else: s_for_vectorization = list(s) - return pd.Series(nmf.fit_transform(s_for_vectorization).tolist(), index=s.index) + return pd.Series(list(nmf.fit_transform(s_for_vectorization)), index=s.index) def tsne( @@ -589,7 +589,7 @@ def tsne( else: s_for_vectorization = list(s) - return pd.Series(tsne.fit_transform(s_for_vectorization).tolist(), index=s.index) + return pd.Series(list(tsne.fit_transform(s_for_vectorization)), index=s.index) """ @@ -963,4 +963,4 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: if isDocumentTermDF: return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) else: - return pd.Series(result.tolist(), index=s.index) + return pd.Series(list(result), index=s.index) From 217611a2c648db4044d240a9c12a157b94b36bca Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 21 Aug 2020 10:21:41 +0200 Subject: [PATCH 05/21] rm .tolist() and added list() --- texthero/representation.py | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 025652d9..fdab73dd 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -37,36 +37,6 @@ def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) - s = pd.Series(list(s.values), index=s.index) - - return s - - -def _check_is_valid_representation(s: pd.Series) -> bool: - """ - Check if the given Pandas Series is a Document Representation Series. - - Returns true if Series is Document Representation Series, else False. 
- - """ - - # TODO: in Version 2 when only representation is accepted as input -> change "return False" to "raise ValueError" - - if not isinstance(s.index, pd.MultiIndex): - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex" - # ) - - if s.index.nlevels != 2: - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2." - # ) - - return True - - # Warning message for not-tokenized inputs _not_tokenized_warning_message = ( "It seems like the given Pandas Series s is not tokenized. This function will" @@ -963,4 +933,4 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: if isDocumentTermDF: return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) else: - return pd.Series(list(result), index=s.index) + return pd.Series((result), index=s.index) From 6a3b56d1a56401880efa7cfa7dd32668e23b25ea Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 21 Aug 2020 10:41:22 +0200 Subject: [PATCH 06/21] Adopted the test to the new dataframes --- tests/test_representation.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 7c02ccd2..3564730e 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -90,7 +90,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): _tfidf(x, s_tokenized, 0) # Testing the tfidf formula here for x in ["!", ".", "?", "TEST", "Test"] ], - [_tfidf(x, s_tokenized, 0) for x in ["!", ".", "?", "TEST", "Test"]], + [_tfidf(x, s_tokenized, 1) for x in ["!", ".", "?", "TEST", "Test"]], ], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("tfidf"), @@ -146,20 +146,28 @@ class AbstractRepresentationTest(PandasTestCase): def test_vectorization_simple(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized) - pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) + pd.testing.assert_frame_equal( + s_true, result_s, check_less_precise=True, check_dtype=False + ) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( self, name, test_function, correct_output=None ): result_s = test_function(s_tokenized_with_noncontinuous_index) - pd.testing.assert_frame_equal(s_tokenized_output_index_noncontinous, result_s.index, check_dtype = False) + pd.testing.assert_series_equal( + pd.Series(s_tokenized_output_index_noncontinous), + pd.Series(result_s.index), + check_dtype=False, + ) @parameterized.expand(test_cases_vectorization_min_df) def test_vectorization_min_df(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized, min_df=2) - pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) + pd.testing.assert_frame_equal( + s_true, result_s, check_less_precise=True, check_dtype=False + ) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): From b8ff5611e550f5f4bc023b2b76ef8ebcff7f8021 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 21 Aug 2020 10:41:35 
+0200 Subject: [PATCH 07/21] wrong format --- texthero/representation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/texthero/representation.py b/texthero/representation.py index fdab73dd..ac0a458f 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -657,7 +657,9 @@ def kmeans( copy_x=True, algorithm=algorithm, ).fit(s_for_vectorization) - return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype("category") + return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype( + "category" + ) def dbscan( From e3af2f9da094505861cddc420f57490700ca88ef Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Fri, 21 Aug 2020 18:48:51 +0200 Subject: [PATCH 08/21] Address most review comments. --- tests/test_representation.py | 19 ++++++++-------- texthero/representation.py | 42 +++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 23 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 3564730e..5f985996 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -50,9 +50,9 @@ def _tfidf(term, corpus, document_index): [["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7] ) -s_tokenized_output_index = [0, 1] +s_tokenized_output_index = pd.Index([0, 1]) -s_tokenized_output_index_noncontinous = [5, 7] +s_tokenized_output_index_noncontinous = pd.Index([5, 7]) def _get_multiindex_for_tokenized_output(first_level_name): @@ -79,7 +79,8 @@ def _get_multiindex_for_tokenized_output(first_level_name): [[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("term_frequency"), - ).astype("Sparse"), + dtype="Sparse", + ).astype("Sparse[float64, nan]"), ], [ "tfidf", @@ -94,7 +95,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): ], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("tfidf"), - ).astype("Sparse"), + ).astype("Sparse[float64, nan]"), ], ] @@ -117,7 +118,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [0.666667, 0.333333], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("term_frequency", "Test")]), - ).astype("Sparse"), + ).astype("Sparse[float64, nan]"), ], [ "tfidf", @@ -126,7 +127,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [2, 1], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), - ).astype("Sparse"), + ).astype("Sparse[float64, nan]"), ], ] @@ -155,10 +156,8 @@ def test_vectorization_noncontinuous_index_kept( self, name, test_function, correct_output=None ): result_s = test_function(s_tokenized_with_noncontinuous_index) - pd.testing.assert_series_equal( - pd.Series(s_tokenized_output_index_noncontinous), - pd.Series(result_s.index), - check_dtype=False, + pd.testing.assert_index_equal( + s_tokenized_output_index_noncontinous, result_s.index ) @parameterized.expand(test_cases_vectorization_min_df) diff --git a/texthero/representation.py b/texthero/representation.py index ac0a458f..7793cb2b 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -145,7 +145,7 @@ def term_frequency( Return a Document Term DataFrame with the term frequencies of the terms for every - document. + document. The output is sparse. TODO add tutorial link The input Series should already be tokenized. 
If not, it will @@ -241,7 +241,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram formula described above. Return a Document Term DataFrame with the - tfidf of every word in the document. + tfidf of every word in the document. The output is sparse. TODO add tutorial link The input Series should already be tokenized. If not, it will @@ -341,9 +341,13 @@ def pca( In general, *pca* should be called after the text has already been represented to a matrix form. + PCA cannot directly handle sparse input, so when calling pca on a + DocumentTermDF, the input has to be expanded which can lead to + memory problems with big datasets. + Parameters ---------- - s : Pandas Series or MuliIndex Sparse DataFrame + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -388,9 +392,6 @@ def pca( return pd.Series(list(pca.fit_transform(values)), index=s.index) -# FIXME: merge master again - - def nmf( s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None ) -> pd.Series: @@ -410,10 +411,12 @@ def nmf( n_components many topics (clusters) and calculate a vector for each document that places it correctly among the topics. + NMF can directly handle sparse input, so when calling nmf on a + DocumentTermDF, the advantage of sparseness is kept. Parameters ---------- - s : Pandas Series or Pandas MultiIndex Sparse DataFrame + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -484,10 +487,12 @@ def tsne( document gets a new, low-dimensional (n_components entries) vector in such a way that the differences / similarities between documents are preserved. + T-SNE can directly handle sparse input, so when calling tsne on a + DocumentTermDF, the advantage of sparseness is kept. Parameters ---------- - s : Pandas Series or Pandas MultiIndex Sparse DataFrame + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : int, default is 2. Number of components to keep (dimensionality of output vectors). @@ -591,9 +596,12 @@ def kmeans( function that assigns a scalar (a weight) to each word), K-means will find k topics (clusters) and assign a topic to each document. + Kmeans can directly handle sparse input, so when calling kmeans on a + DocumentTermDF, the advantage of sparseness is kept. + Parameters ---------- - s: Pandas Series or Pandas MultiIndex Sparse DataFrame + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_clusters: Int, default to 5. The number of clusters to separate the data into. @@ -689,9 +697,12 @@ def dbscan( function that assigns a scalar (a weight) to each word), DBSCAN will find topics (clusters) and assign a topic to each document. + DBSCAN can directly handle sparse input, so when calling dbscan on a + DocumentTermDF, the advantage of sparseness is kept. + Parameters ---------- - s: Pandas Series or Pandas MultiIndex Sparse DataFrame + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) eps : float, default=0.5 The maximum distance between two samples for one to be considered @@ -795,9 +806,13 @@ def meanshift( function that assigns a scalar (a weight) to each word), mean shift will find topics (clusters) and assign a topic to each document. 
+    Meanshift cannot directly handle sparse input, so when calling meanshift on a
+    DocumentTermDF, the input has to be expanded, which can lead to
+    memory problems with big datasets.
+
     Parameters
     ----------
-    s: Pandas Series
+    s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF)
 
     bandwidth : float, default=None
         Bandwidth used in the RBF kernel.
@@ -889,11 +904,12 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series:
     """
     Normalize every cell in a Pandas Series.
 
-    Input has to be a Representation Series.
+    Input can be VectorSeries or DocumentTermDF. For DocumentTermDFs,
+    the sparseness is kept.
 
     Parameters
     ----------
-    s: Pandas Series
+    s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF)
 
     norm: str, default to "l2"
        One of "l1", "l2", or "max". The norm that is used.
 
From 77ad80ecf8977a098b73c4f12c8f28951c769dfc Mon Sep 17 00:00:00 2001
From: Henri Froese
Date: Fri, 21 Aug 2020 19:45:48 +0200
Subject: [PATCH 09/21] Add more unittests for representation

---
 tests/test_representation.py | 118 +++++++++++++++++++++++++++++++++--
 texthero/representation.py   |  14 ++---
 2 files changed, 118 insertions(+), 14 deletions(-)

diff --git a/tests/test_representation.py b/tests/test_representation.py
index 5f985996..2722289e 100644
--- a/tests/test_representation.py
+++ b/tests/test_representation.py
@@ -132,6 +132,50 @@
 ]
 
 
+s_vector_series = pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7])
+s_documenttermDF = pd.DataFrame(
+    [[1.0, 0.0], [0.0, 0.0]],
+    index=[5, 7],
+    columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]),
+).astype("Sparse[float64, nan]")
+
+
+test_cases_dim_reduction_and_clustering = [
+    # format: [function_name, function, correct output for s_vector_series and s_documenttermDF input above]
+    ["pca", representation.pca, pd.Series([[-0.5, 0.0], [0.5, 0.0]], index=[5, 7],),],
+    [
+        "nmf",
+        representation.nmf,
+        pd.Series([[5.119042424626627, 0.0], [0.0, 0.0]], index=[5, 7],),
+    ],
+    [
+        "tsne",
+        representation.tsne,
+        pd.Series([[164.86682, 1814.1647], [-164.8667, -1814.1644]], index=[5, 7],),
+    ],
+    [
+        "kmeans",
+        representation.kmeans,
+        pd.Series([1, 0], index=[5, 7], dtype="category"),
+    ],
+    [
+        "dbscan",
+        representation.dbscan,
+        pd.Series([-1, -1], index=[5, 7], dtype="category"),
+    ],
+    [
+        "meanshift",
+        representation.meanshift,
+        pd.Series([0, 1], index=[5, 7], dtype="category"),
+    ],
+    [
+        "normalize",
+        representation.normalize,
+        pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7],),
+    ],
+]
+
+
 class AbstractRepresentationTest(PandasTestCase):
     """
     Class for representation test cases.
Most tests are @@ -147,9 +191,7 @@ class AbstractRepresentationTest(PandasTestCase): def test_vectorization_simple(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized) - pd.testing.assert_frame_equal( - s_true, result_s, check_less_precise=True, check_dtype=False - ) + pd.testing.assert_frame_equal(s_true, result_s, check_dtype=False) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( @@ -164,9 +206,7 @@ def test_vectorization_noncontinuous_index_kept( def test_vectorization_min_df(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized, min_df=2) - pd.testing.assert_frame_equal( - s_true, result_s, check_less_precise=True, check_dtype=False - ) + pd.testing.assert_frame_equal(s_true, result_s, check_dtype=False) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): @@ -179,3 +219,69 @@ def test_vectorization_arguments_to_sklearn(self, name, test_function, *args): test_function(s_not_tokenized, max_features=1, min_df=1, max_df=1.0) except TypeError: self.fail("Sklearn arguments not handled correctly.") + + """ + Dimensionality Reduction and Clustering + """ + + @parameterized.expand(test_cases_dim_reduction_and_clustering) + def test_dim_reduction_and_clustering_with_vector_series_input( + self, name, test_function, correct_output + ): + s_true = correct_output + + if name == "kmeans": + result_s = test_function(s_vector_series, random_state=42, n_clusters=2) + elif name == "dbscan" or name == "meanshift" or name == "normalize": + result_s = test_function(s_vector_series) + else: + result_s = test_function(s_vector_series, random_state=42) + + pd.testing.assert_series_equal( + s_true, + result_s, + check_dtype=False, + rtol=0.1, + atol=0.1, + check_category_order=False, + ) + + @parameterized.expand(test_cases_dim_reduction_and_clustering) + def test_dim_reduction_and_clustering_with_documenttermDF_input( + self, name, test_function, correct_output + ): + s_true = correct_output + + if name == "normalize": + # testing this below separately + return + + if name == "kmeans": + result_s = test_function(s_documenttermDF, random_state=42, n_clusters=2) + elif name == "dbscan" or name == "meanshift" or name == "normalize": + result_s = test_function(s_documenttermDF) + else: + result_s = test_function(s_documenttermDF, random_state=42) + + pd.testing.assert_series_equal( + s_true, + result_s, + check_dtype=False, + rtol=0.1, + atol=0.1, + check_category_order=False, + ) + + def test_normalize_documenttermDF_also_as_output(self): + # normalize should also return DocumentTermDF output for DocumentTermDF + # input so we test it separately + result = representation.normalize(s_documenttermDF) + correct_output = pd.DataFrame( + [[1.0, 0.0], [0.0, 0.0]], + index=[5, 7], + columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]), + ) + + pd.testing.assert_frame_equal( + result, correct_output, check_dtype=False, rtol=0.1, atol=0.1, + ) diff --git a/texthero/representation.py b/texthero/representation.py index 7793cb2b..8e876088 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -97,7 +97,7 @@ def count( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) - >>> hero.count(s) + >>> hero.count(s) # doctest: +SKIP count Sentence one two 0 1 1 0 @@ -106,8 +106,6 @@ def count( See Also 
-------- - # FIXME columns pandas doctest - Document Term DataFrame: TODO add tutorial link """ # TODO. Can be rewritten without sklearn. @@ -177,7 +175,7 @@ def term_frequency( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize) - >>> hero.term_frequency(s) + >>> hero.term_frequency(s) # doctest: +SKIP term_frequency Sentence hey one two 0 0.2 0.2 0.2 0.0 @@ -273,7 +271,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize) - >>> hero.tfidf(s) + >>> hero.tfidf(s) # doctest: +SKIP tfidf Bye Hi Test 0 1.0 1.405465 0.000000 @@ -900,7 +898,7 @@ def meanshift( """ -def normalize(s: pd.Series, norm="l2") -> pd.Series: +def normalize(s: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Series: """ Normalize every cell in a Pandas Series. @@ -920,7 +918,7 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: >>> import pandas as pd >>> col = pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "c"), (1, "d")]) >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], columns=col).astype("Sparse") - >>> hero.normalize(s, norm="max") + >>> hero.normalize(s, norm="max") # doctest: +SKIP 0 1 a b c d 0 0.250000 0.500000 0.75 1.000000 @@ -951,4 +949,4 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: if isDocumentTermDF: return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) else: - return pd.Series((result), index=s.index) + return pd.Series(list(result), index=s.index) From 3fbeaa56da6b448c28f5157ec2c181fde3103e47 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Wed, 26 Aug 2020 00:38:10 +0200 Subject: [PATCH 10/21] Fix the term_frequency formula. Simplify the function body. 
Co-authored-by: Henri Froese --- tests/test_representation.py | 4 +-- texthero/representation.py | 48 +++++++++++++----------------------- 2 files changed, 19 insertions(+), 33 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 2722289e..3dd12d62 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -76,7 +76,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): "term_frequency", representation.term_frequency, pd.DataFrame( - [[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]], + [[0.25, 0.0, 0.0, 0.25, 0.50], [0.0, 0.5, 0.25, 0.0, 0.25]], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("term_frequency"), dtype="Sparse", @@ -115,7 +115,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): "term_frequency", representation.term_frequency, pd.DataFrame( - [0.666667, 0.333333], + [1.0, 1.0], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("term_frequency", "Test")]), ).astype("Sparse[float64, nan]"), diff --git a/texthero/representation.py b/texthero/representation.py index 8e876088..213d02a4 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -174,43 +174,29 @@ def term_frequency( -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize) - >>> hero.term_frequency(s) # doctest: +SKIP - term_frequency - Sentence hey one two - 0 0.2 0.2 0.2 0.0 - 1 0.2 0.0 0.0 0.2 + >>> s = pd.Series(["Text Text of doc one", "Text of of doc two", "Aha hi bnd one"]).pipe(hero.tokenize) + >>> hero.term_frequency(s) # doctest: +SKIP + term_frequency + Aha Text bnd doc hi of one two + 0 0.00 0.4 0.00 0.2 0.00 0.2 0.20 0.0 + 1 0.00 0.2 0.00 0.2 0.00 0.4 0.00 0.2 + 2 0.25 0.0 0.25 0.0 0.25 0.0 0.25 0.0 See Also -------- Document Term DataFrame: TODO add tutorial link """ - # Check if input is tokenized. Else, print warning and tokenize. - if not isinstance(s.iloc[0], list): - warnings.warn(_not_tokenized_warning_message, DeprecationWarning) - s = preprocessing.tokenize(s) - - tf = CountVectorizer( - max_features=max_features, - tokenizer=lambda x: x, - preprocessor=lambda x: x, - min_df=min_df, - max_df=max_df, - ) - - tf_vectors_csr = tf.fit_transform(s) - tf_vectors_coo = coo_matrix(tf_vectors_csr) - - total_count_coo = np.sum(tf_vectors_coo) - frequency_coo = np.divide(tf_vectors_coo, total_count_coo) - - multiindexed_columns = pd.MultiIndex.from_tuples( - [("term_frequency", word) for word in tf.get_feature_names()] - ) - - return pd.DataFrame.sparse.from_spmatrix( - frequency_coo, s.index, multiindexed_columns + # Term frequency is just the word counts for each document + # with each document divided by the number of terms in the + # document. That's just l1 normalization! + s_term_frequency = s.pipe( + count, max_features=max_features, min_df=min_df, max_df=max_df + ).pipe(normalize, norm="l1") + # Rename first level of index from "count" to "term_frequency" + s_term_frequency.columns = pd.MultiIndex.from_product( + [["term_frequency"], s_term_frequency.columns.levels[1]] ) + return s_term_frequency def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFrame: From 1e8857acc0a7724a2e55041b6fb334657c774787 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Wed, 26 Aug 2020 17:07:50 +0200 Subject: [PATCH 11/21] Implement filter_extremes. 
Missing: Tests & Docstring

Co-authored-by: Maximilian Krahn
---
 texthero/preprocessing.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py
index 0360ab29..baf53516 100644
--- a/texthero/preprocessing.py
+++ b/texthero/preprocessing.py
@@ -15,6 +15,7 @@
 from texthero import stopwords as _stopwords
 from texthero._types import TokenSeries, TextSeries, InputSeries
+from texthero import representation
 
 from typing import List, Callable, Union
 
@@ -959,3 +960,26 @@ def remove_hashtags(s: TextSeries) -> TextSeries:
     with a custom symbol.
     """
     return replace_hashtags(s, " ")
+
+
+def filter_extremes(
+    s: pd.Series,
+    max_features=None,
+    min_df=1,
+    max_df=1.0
+):
+    """
+    Filter out tokens in the dictionary by their frequency.
+    """
+
+    s_term_frequency = representation.term_frequency(
+        s,
+        max_features=max_features,
+        min_df=min_df,
+        max_df=max_df
+    )
+
+    tokens_to_keep = set(s_term_frequency.columns.levels[1])
+
+    # FIXME: Parallelize this after #162 is merged.
+    return s.apply(lambda token_list: [token for token in token_list if token in tokens_to_keep])

From a7ddccbe195c44ab4d5fd2ec8ffe0ad88b702058 Mon Sep 17 00:00:00 2001
From: Henri Froese
Date: Wed, 26 Aug 2020 17:27:15 +0200
Subject: [PATCH 12/21] Add Docstring to filter_extremes

---
 texthero/preprocessing.py | 61 ++++++++++++++++++++++++++++++----------
 1 file changed, 48 insertions(+), 13 deletions(-)

diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py
index baf53516..07584d66 100644
--- a/texthero/preprocessing.py
+++ b/texthero/preprocessing.py
@@ -962,24 +962,59 @@ def remove_hashtags(s: TextSeries) -> TextSeries:
     return replace_hashtags(s, " ")
 
 
-def filter_extremes(
-    s: pd.Series,
-    max_features=None,
-    min_df=1,
-    max_df=1.0
-):
-    """
-    Filter out tokens in the dictionary by their frequency.
+def filter_extremes(s: pd.Series, max_words=None, min_df=1, max_df=1.0):
     """
+    Decrease the size of your documents by
+    filtering out words based on their frequency.
+
+    It is often useful to reduce the size of your dataset
+    by dropping words in order to
+    reduce noise and improve performance.
+    This function removes all words/tokens from
+    all documents where the
+    document frequency (=number of documents a term appears in) is
+    - below min_df
+    - above max_df.
+
+    When min_df or max_df is an integer, then document frequency
+    is the absolute number of documents that a term
+    appears in. When it's a float, it is the
+    proportion of documents a term appears in.
+
+    Additionally, at most max_words words are kept.
+
+    Parameters
+    ----------
+    max_words : int, default to None
+        The maximum number of words/tokens that
+        are kept, according to term frequency descending.
+        If None, will consider all features.
+
+    min_df : int or float, default to 1
+        Remove words that have a document frequency
+        lower than min_df. If float, it represents a
+        proportion of documents, integer absolute counts.
+
+    max_df : int or float, default to 1.0
+        Remove words that have a document frequency
+        higher than max_df. If float, it represents a
+        proportion of documents, integer absolute counts.
+
+    """
+    # Use term_frequency to do the filtering
+    # for us (cannot do this faster as we
+    # need to build the document-term matrix
+    # anyway to filter by min_df and max_df).
     s_term_frequency = representation.term_frequency(
-        s,
-        max_features=max_features,
-        min_df=min_df,
-        max_df=max_df
+        s, max_features=max_words, min_df=min_df, max_df=max_df
     )
 
+    # The remaining tokens are exactly the subcolumn names
+    # in the term_frequency DocumentTermDF.
     tokens_to_keep = set(s_term_frequency.columns.levels[1])
 
+    # Go through documents and only keep tokens in tokens_to_keep.
     # FIXME: Parallelize this after #162 is merged.
-    return s.apply(lambda token_list: [token for token in token_list if token in tokens_to_keep])
+    return s.apply(
+        lambda token_list: [token for token in token_list if token in tokens_to_keep]
+    )

From 4cdf2c1dac74af21d982ad9c675a70d009597f7c Mon Sep 17 00:00:00 2001
From: Maximilian Krahn
Date: Wed, 26 Aug 2020 17:30:16 +0200
Subject: [PATCH 13/21] added tests for filter_extremes

---
 tests/test_preprocessing.py | 42 +++++++++++++++++++++++++++++++++++++
 texthero/preprocessing.py   | 16 +++++-----------
 2 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index 4ca3ace2..50f3ded1 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -381,3 +381,45 @@ def test_remove_hashtags(self):
         s_true = pd.Series("Hi , we will remove you")
 
         self.assertEqual(preprocessing.remove_hashtags(s), s_true)
+
+    """
+    Filter Extremes
+    """
+
+    def test_filter_extremes(self):
+        s = pd.Series(
+            [
+                "Here one two one one one go there",
+                "two go one one one two two two is important",
+            ]
+        )
+        s_result = s.pipe(preprocessing.tokenize).pipe(preprocessing.filter_extremes, 3)
+        s_true = pd.Series(
+            [
+                ["one", "two", "one", "one", "one", "go"],
+                ["two", "go", "one", "one", "one", "two", "two", "two"],
+            ]
+        )
+        pd.testing.assert_series_equal(s_result, s_true)
+
+    def test_filter_extremes_min_and_max(self):
+        s = pd.Series(
+            [
+                "Here one two one one one go there",
+                "two go one one one two two two is important",
+                "one two three four this is good",
+                "here one one important statement",
+            ]
+        )
+        s_result = s.pipe(preprocessing.tokenize).pipe(
+            preprocessing.filter_extremes, min_df=2, max_df=3
+        )
+        s_true = pd.Series(
+            [
+                ["two", "go"],
+                ["two", "go", "two", "two", "two", "is", "important"],
+                ["two", "is"],
+                ["important"],
+            ]
+        )
+        pd.testing.assert_series_equal(s_result, s_true)
diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py
index baf53516..929cb130 100644
--- a/texthero/preprocessing.py
+++ b/texthero/preprocessing.py
@@ -962,24 +962,18 @@ def remove_hashtags(s: TextSeries) -> TextSeries:
     return replace_hashtags(s, " ")
 
 
-def filter_extremes(
-    s: pd.Series,
-    max_features=None,
-    min_df=1,
-    max_df=1.0
-):
+def filter_extremes(s: pd.Series, max_features=None, min_df=1, max_df=1.0):
     """
     Filter out tokens in the dictionary by their frequency.
     """
 
     s_term_frequency = representation.term_frequency(
-        s,
-        max_features=max_features,
-        min_df=min_df,
-        max_df=max_df
+        s, max_features=max_features, min_df=min_df, max_df=max_df
     )
 
     tokens_to_keep = set(s_term_frequency.columns.levels[1])
 
     # FIXME: Parallelize this after #162 is merged.
-    return s.apply(lambda token_list: [token for token in token_list if token in tokens_to_keep])
+    return s.apply(
+        lambda token_list: [token for token in token_list if token in tokens_to_keep]
+    )

From a5f3736fb46a27bacb72844c76adec03058f4940 Mon Sep 17 00:00:00 2001
From: Maximilian Krahn
Date: Wed, 26 Aug 2020 17:36:56 +0200
Subject: [PATCH 14/21] added example in docstring + typing

---
 texthero/preprocessing.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py
index 07584d66..ed461ad4 100644
--- a/texthero/preprocessing.py
+++ b/texthero/preprocessing.py
@@ -962,7 +962,10 @@ def remove_hashtags(s: TextSeries) -> TextSeries:
     return replace_hashtags(s, " ")
 
 
-def filter_extremes(s: pd.Series, max_words=None, min_df=1, max_df=1.0):
+@InputSeries(TokenSeries)
+def filter_extremes(
+    s: TokenSeries, max_words=None, min_df=1, max_df=1.0
+) -> TokenSeries:
     """
     Decrease the size of your documents by
     filtering out words based on their frequency.
@@ -1000,6 +1003,20 @@ def filter_extremes(s: pd.Series, max_words=None, min_df=1, max_df=1.0):
         higher than max_df. If float, it represents a
         proportion of documents, integer absolute counts.
 
+    Example
+    -------
+    >>> import texthero as hero
+    >>> import pandas as pd
+    >>> s = pd.Series( 
+    ...     [
+    ...         "Here one two one one one go there",
+    ...         "two go one one one two two two is important",
+    ...     ]
+    ... )
+    >>> s.pipe(hero.tokenize).pipe(hero.filter_extremes, 3)
+    0              [one, two, one, one, one, go]
+    1    [two, go, one, one, one, two, two, two]
+
     """
     # Use term_frequency to do the filtering
     # for us (cannot do this faster as we

From 8a6fde0ce9d44bbbff5615439cd2187cf2e4792d Mon Sep 17 00:00:00 2001
From: Maximilian Krahn
Date: Wed, 26 Aug 2020 17:38:30 +0200
Subject: [PATCH 15/21] added example

---
 texthero/preprocessing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py
index ed461ad4..15f13713 100644
--- a/texthero/preprocessing.py
+++ b/texthero/preprocessing.py
@@ -1007,7 +1007,7 @@ def filter_extremes(
     >>> import texthero as hero
     >>> import pandas as pd
-    >>> s = pd.Series( 
+    >>> s = pd.Series(
     ...     [
     ...         "Here one two one one one go there",
     ...
"two go one one one two two two is important", @@ -1016,7 +1016,7 @@ def filter_extremes( >>> s.pipe(hero.tokenize).pipe(hero.filter_extremes, 3) 0 [one, two, one, one, one, go] 1 [two, go, one, one, one, two, two, two] - + dtype: object """ # Use term_frequency to do the filtering # for us (cannot do this faster as we From d6cc5f8850898ca8854bd6488c23419ff94c5136 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Wed, 26 Aug 2020 20:09:38 +0200 Subject: [PATCH 16/21] format with new black version --- tests/test_indexes.py | 18 ++++++++++--- tests/test_representation.py | 30 ++++++++++++++++++---- texthero/nlp.py | 2 +- texthero/preprocessing.py | 50 ++++++++++++++++++------------------ texthero/representation.py | 44 +++++++++++++++++++------------ 5 files changed, 94 insertions(+), 50 deletions(-) diff --git a/tests/test_indexes.py b/tests/test_indexes.py index af7afcd2..01fe3e4f 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -56,9 +56,21 @@ ] test_cases_representation = [ - ["count", representation.count, (s_tokenized_lists,),], - ["term_frequency", representation.term_frequency, (s_tokenized_lists,),], - ["tfidf", representation.tfidf, (s_tokenized_lists,),], + [ + "count", + representation.count, + (s_tokenized_lists,), + ], + [ + "term_frequency", + representation.term_frequency, + (s_tokenized_lists,), + ], + [ + "tfidf", + representation.tfidf, + (s_tokenized_lists,), + ], ["pca", representation.pca, (s_numeric_lists, 0)], ["nmf", representation.nmf, (s_numeric_lists,)], ["tsne", representation.tsne, (s_numeric_lists,)], diff --git a/tests/test_representation.py b/tests/test_representation.py index 3dd12d62..6643b75f 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -142,16 +142,29 @@ def _get_multiindex_for_tokenized_output(first_level_name): test_cases_dim_reduction_and_clustering = [ # format: [function_name, function, correct output for s_vector_series and s_documenttermDF input above] - ["pca", representation.pca, pd.Series([[-0.5, 0.0], [0.5, 0.0]], index=[5, 7],),], + [ + "pca", + representation.pca, + pd.Series( + [[-0.5, 0.0], [0.5, 0.0]], + index=[5, 7], + ), + ], [ "nmf", representation.nmf, - pd.Series([[5.119042424626627, 0.0], [0.0, 0.0]], index=[5, 7],), + pd.Series( + [[5.119042424626627, 0.0], [0.0, 0.0]], + index=[5, 7], + ), ], [ "tsne", representation.tsne, - pd.Series([[164.86682, 1814.1647], [-164.8667, -1814.1644]], index=[5, 7],), + pd.Series( + [[164.86682, 1814.1647], [-164.8667, -1814.1644]], + index=[5, 7], + ), ], [ "kmeans", @@ -171,7 +184,10 @@ def _get_multiindex_for_tokenized_output(first_level_name): [ "normalize", representation.normalize, - pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7],), + pd.Series( + [[1.0, 0.0], [0.0, 0.0]], + index=[5, 7], + ), ], ] @@ -283,5 +299,9 @@ def test_normalize_documenttermDF_also_as_output(self): ) pd.testing.assert_frame_equal( - result, correct_output, check_dtype=False, rtol=0.1, atol=0.1, + result, + correct_output, + check_dtype=False, + rtol=0.1, + atol=0.1, ) diff --git a/texthero/nlp.py b/texthero/nlp.py index f32498e5..863d5f70 100644 --- a/texthero/nlp.py +++ b/texthero/nlp.py @@ -156,7 +156,7 @@ def pos_tag(s: TextSeries) -> pd.Series: coarse-grained POS has a NOUN value, then the refined POS will give more details about the type of the noun, whether it is singular, plural and/or proper. - + You can use the spacy `explain` function to find out which fine-grained POS it is. 
diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 15f13713..1e7d1d54 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -38,8 +38,8 @@ def fillna(s: TextSeries) -> TextSeries: >>> s = pd.Series(["I'm", np.NaN, pd.NA, "You're"]) >>> hero.fillna(s) 0 I'm - 1 - 2 + 1 + 2 3 You're dtype: object """ @@ -51,7 +51,7 @@ def lowercase(s: TextSeries) -> TextSeries: """ Lowercase all texts in a series. - + Examples -------- >>> import texthero as hero @@ -129,10 +129,10 @@ def remove_digits(s: TextSeries, only_blocks=True) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("7ex7hero is fun 1111") >>> hero.preprocessing.remove_digits(s) - 0 7ex7hero is fun + 0 7ex7hero is fun dtype: object >>> hero.preprocessing.remove_digits(s, only_blocks=False) - 0 ex hero is fun + 0 ex hero is fun dtype: object """ @@ -145,8 +145,8 @@ def replace_punctuation(s: TextSeries, symbol: str = " ") -> TextSeries: Replace all punctuation with a given symbol. Replace all punctuation from the given - Pandas Series with a custom symbol. - It considers as punctuation characters all :data:`string.punctuation` + Pandas Series with a custom symbol. + It considers as punctuation characters all :data:`string.punctuation` symbols `!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~).` @@ -155,7 +155,7 @@ def replace_punctuation(s: TextSeries, symbol: str = " ") -> TextSeries: s : :class:`texthero._types.TextSeries` symbol : str (default single empty space) - Symbol to use as replacement for all string punctuation. + Symbol to use as replacement for all string punctuation. Examples -------- @@ -163,7 +163,7 @@ def replace_punctuation(s: TextSeries, symbol: str = " ") -> TextSeries: >>> import pandas as pd >>> s = pd.Series("Finnaly.") >>> hero.replace_punctuation(s, " ") - 0 Finnaly + 0 Finnaly dtype: object """ @@ -188,7 +188,7 @@ def remove_punctuation(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("Finnaly.") >>> hero.remove_punctuation(s) - 0 Finnaly + 0 Finnaly dtype: object """ return replace_punctuation(s, " ") @@ -310,7 +310,7 @@ def replace_stopwords( stopwords : Set[str], Optional Set of stopwords string to remove. If not passed, by default it used - NLTK English stopwords. + NLTK English stopwords. 
Examples -------- @@ -366,7 +366,7 @@ def remove_stopwords( >>> custom_stopwords = default_stopwords.union(set(["heroes"])) >>> s = pd.Series("Texthero is not only for the heroes") >>> hero.remove_stopwords(s, custom_stopwords) - 0 Texthero + 0 Texthero dtype: object @@ -476,7 +476,7 @@ def clean(s: TextSeries, pipeline=None) -> TextSeries: pipeline :List[Callable[[Pandas Series], Pandas Series]] inserting specific pipeline to clean a text - + Examples -------- For the default pipeline: @@ -551,7 +551,7 @@ def remove_round_brackets(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("Texthero (is not a superhero!)") >>> hero.remove_round_brackets(s) - 0 Texthero + 0 Texthero dtype: object See also @@ -577,7 +577,7 @@ def remove_curly_brackets(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("Texthero {is not a superhero!}") >>> hero.remove_curly_brackets(s) - 0 Texthero + 0 Texthero dtype: object See also @@ -603,7 +603,7 @@ def remove_square_brackets(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("Texthero [is not a superhero!]") >>> hero.remove_square_brackets(s) - 0 Texthero + 0 Texthero dtype: object See also @@ -630,7 +630,7 @@ def remove_angle_brackets(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("Texthero ") >>> hero.remove_angle_brackets(s) - 0 Texthero + 0 Texthero dtype: object See also @@ -657,7 +657,7 @@ def remove_brackets(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("Texthero (round) [square] [curly] [angle]") >>> hero.remove_brackets(s) - 0 Texthero + 0 Texthero dtype: object See also @@ -760,13 +760,13 @@ def phrases( Parameters ---------- s : :class:`texthero._types.TokenSeries` - + min_count : Int, optional. Default is 5. ignore tokens with frequency less than this - + threshold : Int, optional. Default is 10. ignore tokens with a score under that threshold - + symbol : Str, optional. Default is '_'. character used to join collocation words @@ -844,7 +844,7 @@ def remove_urls(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("Go to: https://example.com") >>> hero.remove_urls(s) - 0 Go to: + 0 Go to: dtype: object See also @@ -915,7 +915,7 @@ def replace_hashtags(s: TextSeries, symbol: str) -> TextSeries: """Replace all hashtags from a Pandas Series with symbol A hashtag is a string formed by # concatenated with a sequence of - characters, digits and underscores. Example: #texthero_123. + characters, digits and underscores. Example: #texthero_123. Parameters ---------- @@ -923,7 +923,7 @@ def replace_hashtags(s: TextSeries, symbol: str) -> TextSeries: symbols : str Symbols to replace - + Examples -------- >>> import texthero as hero @@ -943,7 +943,7 @@ def remove_hashtags(s: TextSeries) -> TextSeries: """Remove all hashtags from a given Pandas Series A hashtag is a string formed by # concatenated with a sequence of - characters, digits and underscores. Example: #texthero_123. + characters, digits and underscores. Example: #texthero_123. Examples -------- diff --git a/texthero/representation.py b/texthero/representation.py index 213d02a4..c03d93ef 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -78,7 +78,7 @@ def count( min_df : float in range [0.0, 1.0] or int, default=1 When building the vocabulary ignore terms that have a document - frequency (number of documents they appear in) strictly + frequency (number of documents they appear in) strictly lower than the given threshold. 
If float, the parameter represents a proportion of documents, integer absolute counts. @@ -98,11 +98,11 @@ def count( >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) >>> hero.count(s) # doctest: +SKIP - count + count Sentence one two 0 1 1 0 1 1 0 1 - + See Also -------- @@ -136,7 +136,10 @@ def count( def term_frequency( - s: pd.Series, max_features: Optional[int] = None, min_df=1, max_df=1.0, + s: pd.Series, + max_features: Optional[int] = None, + min_df=1, + max_df=1.0, ) -> pd.DataFrame: """ Represent a text-based Pandas Series using term frequency. @@ -159,7 +162,7 @@ def term_frequency( min_df : float in range [0.0, 1.0] or int, default=1 When building the vocabulary ignore terms that have a document - frequency (number of documents they appear in) strictly + frequency (number of documents they appear in) strictly lower than the given threshold. If float, the parameter represents a proportion of documents, integer absolute counts. @@ -176,7 +179,7 @@ def term_frequency( >>> import pandas as pd >>> s = pd.Series(["Text Text of doc one", "Text of of doc two", "Aha hi bnd one"]).pipe(hero.tokenize) >>> hero.term_frequency(s) # doctest: +SKIP - term_frequency + term_frequency Aha Text bnd doc hi of one two 0 0.00 0.4 0.00 0.2 0.00 0.2 0.20 0.0 1 0.00 0.2 0.00 0.2 0.00 0.4 0.00 0.2 @@ -199,7 +202,12 @@ def term_frequency( return s_term_frequency -def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFrame: +def tfidf( + s: pd.Series, + max_features=None, + min_df=1, + max_df=1.0, +) -> pd.DataFrame: """ Represent a text-based Pandas Series using TF-IDF. @@ -218,7 +226,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram Finally, tf-idf(document d, term t) = tf(d, t) * idf(t). - Different from the `sklearn-implementation of + Different from the `sklearn-implementation of tfidf `, this function does *not* normalize the output in any way, so the result is exactly what you get applying the @@ -240,7 +248,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram min_df : float in range [0.0, 1.0] or int, default=1 When building the vocabulary ignore terms that have a document - frequency (number of documents they appear in) strictly + frequency (number of documents they appear in) strictly lower than the given threshold. If float, the parameter represents a proportion of documents, integer absolute counts. @@ -258,7 +266,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram >>> import pandas as pd >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize) >>> hero.tfidf(s) # doctest: +SKIP - tfidf + tfidf Bye Hi Test 0 1.0 1.405465 0.000000 1 2.0 0.000000 1.405465 @@ -386,7 +394,7 @@ def nmf( natural language processing to find clusters of similar texts (e.g. some texts in a corpus might be about sports and some about music, so they will differ in the usage - of technical terms; see the example below). + of technical terms; see the example below). Given a document-term matrix (so in texthero usually a Series after applying @@ -429,7 +437,7 @@ def nmf( >>> # As we can see, the third document, which >>> # is a mix of sports and music, is placed >>> # between the two axes (the topics) while - >>> # the other documents are placed right on + >>> # the other documents are placed right on >>> # one topic axis each. 
See also @@ -438,7 +446,11 @@ def nmf( `_ """ - nmf = NMF(n_components=n_components, init="random", random_state=random_state,) + nmf = NMF( + n_components=n_components, + init="random", + random_state=random_state, + ) if _check_is_valid_DocumentTermDF(s): s_coo = s.sparse.to_coo() @@ -568,11 +580,11 @@ def kmeans( Performs K-means clustering algorithm. K-means clustering is used in natural language processing - to separate texts into k clusters (groups) + to separate texts into k clusters (groups) (e.g. some texts in a corpus might be about sports and some about music, so they will differ in the usage of technical terms; the K-means algorithm uses this - to separate them into two clusters). + to separate them into two clusters). Given a document-term matrix (so in texthero usually a Series after applying @@ -905,7 +917,7 @@ def normalize(s: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Series: >>> col = pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "c"), (1, "d")]) >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], columns=col).astype("Sparse") >>> hero.normalize(s, norm="max") # doctest: +SKIP - 0 1 + 0 1 a b c d 0 0.250000 0.500000 0.75 1.000000 1 0.571429 0.285714 1.00 0.714286 From 86c1c097e6613f4161bbf9c4deb81b622eb9a672 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Fri, 28 Aug 2020 17:05:43 +0200 Subject: [PATCH 17/21] Fix formatting errors by rolling back black update. Black just rolled out V20.8b1. This creates errors with our ./tests.sh -> switch back --- .travis.yml | 2 +- setup.cfg | 2 +- texthero/nlp.py | 2 +- texthero/preprocessing.py | 25 ++++++++++++------------- texthero/representation.py | 26 +++++++++++++------------- 5 files changed, 28 insertions(+), 29 deletions(-) diff --git a/.travis.yml b/.travis.yml index f913f183..c76284b3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,7 @@ jobs: env: PATH=/c/Python38:/c/Python38/Scripts:$PATH install: - pip3 install --upgrade pip # all three OSes agree about 'pip3' - - pip3 install black + - pip3 install black==19.10b0 - pip3 install ".[dev]" . # 'python' points to Python 2.7 on macOS but points to Python 3.8 on Linux and Windows # 'python3' is a 'command not found' error on Windows but 'py' works on Windows only diff --git a/setup.cfg b/setup.cfg index d6103b02..3f86e7f3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ install_requires = # TODO pick the correct version. [options.extras_require] dev = - black>=19.10b0 + black==19.10b0 pytest>=4.0.0 Sphinx>=3.0.3 sphinx-markdown-builder>=0.5.4 diff --git a/texthero/nlp.py b/texthero/nlp.py index f32498e5..863d5f70 100644 --- a/texthero/nlp.py +++ b/texthero/nlp.py @@ -156,7 +156,7 @@ def pos_tag(s: TextSeries) -> pd.Series: coarse-grained POS has a NOUN value, then the refined POS will give more details about the type of the noun, whether it is singular, plural and/or proper. - + You can use the spacy `explain` function to find out which fine-grained POS it is. diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 07584d66..c099c44f 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -51,7 +51,7 @@ def lowercase(s: TextSeries) -> TextSeries: """ Lowercase all texts in a series. - + Examples -------- >>> import texthero as hero @@ -145,8 +145,8 @@ def replace_punctuation(s: TextSeries, symbol: str = " ") -> TextSeries: Replace all punctuation with a given symbol. Replace all punctuation from the given - Pandas Series with a custom symbol. 
- It considers as punctuation characters all :data:`string.punctuation` + Pandas Series with a custom symbol. + It considers as punctuation characters all :data:`string.punctuation` symbols `!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~).` @@ -155,7 +155,7 @@ def replace_punctuation(s: TextSeries, symbol: str = " ") -> TextSeries: s : :class:`texthero._types.TextSeries` symbol : str (default single empty space) - Symbol to use as replacement for all string punctuation. + Symbol to use as replacement for all string punctuation. Examples -------- @@ -310,7 +310,7 @@ def replace_stopwords( stopwords : Set[str], Optional Set of stopwords string to remove. If not passed, by default it used - NLTK English stopwords. + NLTK English stopwords. Examples -------- @@ -369,7 +369,6 @@ def remove_stopwords( 0 Texthero dtype: object - """ return replace_stopwords(s, symbol="", stopwords=stopwords) @@ -476,7 +475,7 @@ def clean(s: TextSeries, pipeline=None) -> TextSeries: pipeline :List[Callable[[Pandas Series], Pandas Series]] inserting specific pipeline to clean a text - + Examples -------- For the default pipeline: @@ -760,13 +759,13 @@ def phrases( Parameters ---------- s : :class:`texthero._types.TokenSeries` - + min_count : Int, optional. Default is 5. ignore tokens with frequency less than this - + threshold : Int, optional. Default is 10. ignore tokens with a score under that threshold - + symbol : Str, optional. Default is '_'. character used to join collocation words @@ -915,7 +914,7 @@ def replace_hashtags(s: TextSeries, symbol: str) -> TextSeries: """Replace all hashtags from a Pandas Series with symbol A hashtag is a string formed by # concatenated with a sequence of - characters, digits and underscores. Example: #texthero_123. + characters, digits and underscores. Example: #texthero_123. Parameters ---------- @@ -923,7 +922,7 @@ def replace_hashtags(s: TextSeries, symbol: str) -> TextSeries: symbols : str Symbols to replace - + Examples -------- >>> import texthero as hero @@ -943,7 +942,7 @@ def remove_hashtags(s: TextSeries) -> TextSeries: """Remove all hashtags from a given Pandas Series A hashtag is a string formed by # concatenated with a sequence of - characters, digits and underscores. Example: #texthero_123. + characters, digits and underscores. Example: #texthero_123. Examples -------- diff --git a/texthero/representation.py b/texthero/representation.py index 213d02a4..c21b7075 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -78,7 +78,7 @@ def count( min_df : float in range [0.0, 1.0] or int, default=1 When building the vocabulary ignore terms that have a document - frequency (number of documents they appear in) strictly + frequency (number of documents they appear in) strictly lower than the given threshold. If float, the parameter represents a proportion of documents, integer absolute counts. @@ -98,11 +98,11 @@ def count( >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) >>> hero.count(s) # doctest: +SKIP - count + count Sentence one two 0 1 1 0 1 1 0 1 - + See Also -------- @@ -159,7 +159,7 @@ def term_frequency( min_df : float in range [0.0, 1.0] or int, default=1 When building the vocabulary ignore terms that have a document - frequency (number of documents they appear in) strictly + frequency (number of documents they appear in) strictly lower than the given threshold. If float, the parameter represents a proportion of documents, integer absolute counts. 
@@ -176,7 +176,7 @@ def term_frequency( >>> import pandas as pd >>> s = pd.Series(["Text Text of doc one", "Text of of doc two", "Aha hi bnd one"]).pipe(hero.tokenize) >>> hero.term_frequency(s) # doctest: +SKIP - term_frequency + term_frequency Aha Text bnd doc hi of one two 0 0.00 0.4 0.00 0.2 0.00 0.2 0.20 0.0 1 0.00 0.2 0.00 0.2 0.00 0.4 0.00 0.2 @@ -218,7 +218,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram Finally, tf-idf(document d, term t) = tf(d, t) * idf(t). - Different from the `sklearn-implementation of + Different from the `sklearn-implementation of tfidf `, this function does *not* normalize the output in any way, so the result is exactly what you get applying the @@ -240,7 +240,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram min_df : float in range [0.0, 1.0] or int, default=1 When building the vocabulary ignore terms that have a document - frequency (number of documents they appear in) strictly + frequency (number of documents they appear in) strictly lower than the given threshold. If float, the parameter represents a proportion of documents, integer absolute counts. @@ -258,7 +258,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram >>> import pandas as pd >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize) >>> hero.tfidf(s) # doctest: +SKIP - tfidf + tfidf Bye Hi Test 0 1.0 1.405465 0.000000 1 2.0 0.000000 1.405465 @@ -386,7 +386,7 @@ def nmf( natural language processing to find clusters of similar texts (e.g. some texts in a corpus might be about sports and some about music, so they will differ in the usage - of technical terms; see the example below). + of technical terms; see the example below). Given a document-term matrix (so in texthero usually a Series after applying @@ -429,7 +429,7 @@ def nmf( >>> # As we can see, the third document, which >>> # is a mix of sports and music, is placed >>> # between the two axes (the topics) while - >>> # the other documents are placed right on + >>> # the other documents are placed right on >>> # one topic axis each. See also @@ -568,11 +568,11 @@ def kmeans( Performs K-means clustering algorithm. K-means clustering is used in natural language processing - to separate texts into k clusters (groups) + to separate texts into k clusters (groups) (e.g. some texts in a corpus might be about sports and some about music, so they will differ in the usage of technical terms; the K-means algorithm uses this - to separate them into two clusters). + to separate them into two clusters). Given a document-term matrix (so in texthero usually a Series after applying @@ -905,7 +905,7 @@ def normalize(s: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Series: >>> col = pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "c"), (1, "d")]) >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], columns=col).astype("Sparse") >>> hero.normalize(s, norm="max") # doctest: +SKIP - 0 1 + 0 1 a b c d 0 0.250000 0.500000 0.75 1.000000 1 0.571429 0.285714 1.00 0.714286 From b0ca92c19e7263ae3828da954ddb207b3125d1ab Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sat, 29 Aug 2020 13:22:41 +0200 Subject: [PATCH 18/21] Finish fixing formatting. 
--- tests/test_indexes.py | 18 +++--------------- tests/test_representation.py | 30 +++++------------------------- texthero/preprocessing.py | 26 +++++++++++++------------- texthero/representation.py | 18 +++--------------- 4 files changed, 24 insertions(+), 68 deletions(-) diff --git a/tests/test_indexes.py b/tests/test_indexes.py index 01fe3e4f..af7afcd2 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -56,21 +56,9 @@ ] test_cases_representation = [ - [ - "count", - representation.count, - (s_tokenized_lists,), - ], - [ - "term_frequency", - representation.term_frequency, - (s_tokenized_lists,), - ], - [ - "tfidf", - representation.tfidf, - (s_tokenized_lists,), - ], + ["count", representation.count, (s_tokenized_lists,),], + ["term_frequency", representation.term_frequency, (s_tokenized_lists,),], + ["tfidf", representation.tfidf, (s_tokenized_lists,),], ["pca", representation.pca, (s_numeric_lists, 0)], ["nmf", representation.nmf, (s_numeric_lists,)], ["tsne", representation.tsne, (s_numeric_lists,)], diff --git a/tests/test_representation.py b/tests/test_representation.py index 6643b75f..3dd12d62 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -142,29 +142,16 @@ def _get_multiindex_for_tokenized_output(first_level_name): test_cases_dim_reduction_and_clustering = [ # format: [function_name, function, correct output for s_vector_series and s_documenttermDF input above] - [ - "pca", - representation.pca, - pd.Series( - [[-0.5, 0.0], [0.5, 0.0]], - index=[5, 7], - ), - ], + ["pca", representation.pca, pd.Series([[-0.5, 0.0], [0.5, 0.0]], index=[5, 7],),], [ "nmf", representation.nmf, - pd.Series( - [[5.119042424626627, 0.0], [0.0, 0.0]], - index=[5, 7], - ), + pd.Series([[5.119042424626627, 0.0], [0.0, 0.0]], index=[5, 7],), ], [ "tsne", representation.tsne, - pd.Series( - [[164.86682, 1814.1647], [-164.8667, -1814.1644]], - index=[5, 7], - ), + pd.Series([[164.86682, 1814.1647], [-164.8667, -1814.1644]], index=[5, 7],), ], [ "kmeans", @@ -184,10 +171,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [ "normalize", representation.normalize, - pd.Series( - [[1.0, 0.0], [0.0, 0.0]], - index=[5, 7], - ), + pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7],), ], ] @@ -299,9 +283,5 @@ def test_normalize_documenttermDF_also_as_output(self): ) pd.testing.assert_frame_equal( - result, - correct_output, - check_dtype=False, - rtol=0.1, - atol=0.1, + result, correct_output, check_dtype=False, rtol=0.1, atol=0.1, ) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index effeb074..d5d41fa0 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -38,8 +38,8 @@ def fillna(s: TextSeries) -> TextSeries: >>> s = pd.Series(["I'm", np.NaN, pd.NA, "You're"]) >>> hero.fillna(s) 0 I'm - 1 - 2 + 1 + 2 3 You're dtype: object """ @@ -129,10 +129,10 @@ def remove_digits(s: TextSeries, only_blocks=True) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("7ex7hero is fun 1111") >>> hero.preprocessing.remove_digits(s) - 0 7ex7hero is fun + 0 7ex7hero is fun dtype: object >>> hero.preprocessing.remove_digits(s, only_blocks=False) - 0 ex hero is fun + 0 ex hero is fun dtype: object """ @@ -163,7 +163,7 @@ def replace_punctuation(s: TextSeries, symbol: str = " ") -> TextSeries: >>> import pandas as pd >>> s = pd.Series("Finnaly.") >>> hero.replace_punctuation(s, " ") - 0 Finnaly + 0 Finnaly dtype: object """ @@ -188,7 +188,7 @@ def remove_punctuation(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = 
pd.Series("Finnaly.") >>> hero.remove_punctuation(s) - 0 Finnaly + 0 Finnaly dtype: object """ return replace_punctuation(s, " ") @@ -366,7 +366,7 @@ def remove_stopwords( >>> custom_stopwords = default_stopwords.union(set(["heroes"])) >>> s = pd.Series("Texthero is not only for the heroes") >>> hero.remove_stopwords(s, custom_stopwords) - 0 Texthero + 0 Texthero dtype: object """ @@ -550,7 +550,7 @@ def remove_round_brackets(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("Texthero (is not a superhero!)") >>> hero.remove_round_brackets(s) - 0 Texthero + 0 Texthero dtype: object See also @@ -576,7 +576,7 @@ def remove_curly_brackets(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("Texthero {is not a superhero!}") >>> hero.remove_curly_brackets(s) - 0 Texthero + 0 Texthero dtype: object See also @@ -602,7 +602,7 @@ def remove_square_brackets(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("Texthero [is not a superhero!]") >>> hero.remove_square_brackets(s) - 0 Texthero + 0 Texthero dtype: object See also @@ -629,7 +629,7 @@ def remove_angle_brackets(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("Texthero ") >>> hero.remove_angle_brackets(s) - 0 Texthero + 0 Texthero dtype: object See also @@ -656,7 +656,7 @@ def remove_brackets(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("Texthero (round) [square] [curly] [angle]") >>> hero.remove_brackets(s) - 0 Texthero + 0 Texthero dtype: object See also @@ -843,7 +843,7 @@ def remove_urls(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("Go to: https://example.com") >>> hero.remove_urls(s) - 0 Go to: + 0 Go to: dtype: object See also diff --git a/texthero/representation.py b/texthero/representation.py index c03d93ef..c21b7075 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -136,10 +136,7 @@ def count( def term_frequency( - s: pd.Series, - max_features: Optional[int] = None, - min_df=1, - max_df=1.0, + s: pd.Series, max_features: Optional[int] = None, min_df=1, max_df=1.0, ) -> pd.DataFrame: """ Represent a text-based Pandas Series using term frequency. @@ -202,12 +199,7 @@ def term_frequency( return s_term_frequency -def tfidf( - s: pd.Series, - max_features=None, - min_df=1, - max_df=1.0, -) -> pd.DataFrame: +def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFrame: """ Represent a text-based Pandas Series using TF-IDF. 
@@ -446,11 +438,7 @@ def nmf(
     `_
     """
-    nmf = NMF(
-        n_components=n_components,
-        init="random",
-        random_state=random_state,
-    )
+    nmf = NMF(n_components=n_components, init="random", random_state=random_state,)

     if _check_is_valid_DocumentTermDF(s):
         s_coo = s.sparse.to_coo()

From efd9fde677e0d7b0da5f156b7049f9860364030d Mon Sep 17 00:00:00 2001
From: Maximilian Krahn
Date: Tue, 22 Sep 2020 14:17:00 +0200
Subject: [PATCH 19/21] fixed merge issues

---
 tests/test_representation.py | 4 ++--
 texthero/representation.py   | 5 +----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/tests/test_representation.py b/tests/test_representation.py
index 988e87c5..bbcfcae8 100644
--- a/tests/test_representation.py
+++ b/tests/test_representation.py
@@ -69,7 +69,7 @@ def _tfidf(term, corpus, document_index):
         "term_frequency",
         representation.term_frequency,
         pd.DataFrame(
-            [[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]],
+            [[0.25, 0.0, 0.0, 0.25, 0.5], [0.0, 0.5, 0.25, 0.0, 0.25]],
             index=tokenized_output_index,
             columns=["!", ".", "?", "TEST", "Test"],
             dtype="Sparse",
@@ -106,7 +106,7 @@ def _tfidf(term, corpus, document_index):
         "term_frequency",
         representation.term_frequency,
         pd.DataFrame(
-            [0.666667, 0.333333], index=tokenized_output_index, columns=["Test"],
+            [1, 1], index=tokenized_output_index, columns=["Test"],
         ).astype("Sparse[float64, nan]"),
     ],
     [
diff --git a/texthero/representation.py b/texthero/representation.py
index 1d58014e..ceda1089 100644
--- a/texthero/representation.py
+++ b/texthero/representation.py
@@ -187,10 +187,7 @@ def term_frequency(
     s_term_frequency = s.pipe(
         count, max_features=max_features, min_df=min_df, max_df=max_df
     ).pipe(normalize, norm="l1")
-    # Rename first level of the columns from "count" to "term_frequency"
-    s_term_frequency.columns = pd.MultiIndex.from_product(
-        [["term_frequency"], s_term_frequency.columns.levels[1]]
-    )
+
     return s_term_frequency

From c1dd5eb8ba9283d5ccad68d54c200ba3c18dd6de Mon Sep 17 00:00:00 2001
From: Maximilian Krahn
Date: Tue, 22 Sep 2020 15:00:20 +0200
Subject: [PATCH 20/21] fix formatting

---
 tests/test_representation.py | 6 +++---
 texthero/representation.py   | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_representation.py b/tests/test_representation.py
index bbcfcae8..1b9fd03a 100644
--- a/tests/test_representation.py
+++ b/tests/test_representation.py
@@ -105,9 +105,9 @@ def _tfidf(term, corpus, document_index):
     [
         "term_frequency",
         representation.term_frequency,
-        pd.DataFrame(
-            [1, 1], index=tokenized_output_index, columns=["Test"],
-        ).astype("Sparse[float64, nan]"),
+        pd.DataFrame([1, 1], index=tokenized_output_index, columns=["Test"],).astype(
+            "Sparse[float64, nan]"
+        ),
     ],
     [
         "tfidf",
diff --git a/texthero/representation.py b/texthero/representation.py
index ceda1089..8bbfeadb 100644
--- a/texthero/representation.py
+++ b/texthero/representation.py
@@ -187,7 +187,7 @@ def term_frequency(
     s_term_frequency = s.pipe(
         count, max_features=max_features, min_df=min_df, max_df=max_df
     ).pipe(normalize, norm="l1")
-    
+
     return s_term_frequency

From 87eef827d8cd1a7f0ba68c1b2a25297b3bc53430 Mon Sep 17 00:00:00 2001
From: Maximilian Krahn
Date: Tue, 22 Sep 2020 15:12:29 +0200
Subject: [PATCH 21/21] fixed merge issues

---
 texthero/preprocessing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py
index 4c252c22..3de98214 100644
--- a/texthero/preprocessing.py
+++ b/texthero/preprocessing.py
@@ -974,7 +974,7 @@ def
filter_extremes( # The remaining tokens are exactly the subcolumn names # in the term_frequency DocumentTermDF. - tokens_to_keep = set(s_term_frequency.columns.levels[1]) + tokens_to_keep = set(s_term_frequency.columns) # Go through documents and only keep tokens in tokens_to_keep. # FIXME: Parallelize this after #162 is merged.
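
A minimal end-to-end sketch of the behaviour this series converges on. This is illustrative only and not part of any patch above; it assumes this branch is installed as `texthero`, that after PATCH 19-21 `count` and `term_frequency` return a sparse document-term frame with one plain column per token, and that `filter_extremes` is exported at the top level (as its doctest in PATCH 14 implies). Exact sparse dtypes and fill values may differ.

    import numpy as np
    import pandas as pd
    import texthero as hero

    s = pd.Series(
        [
            "Here one two one one one go there",
            "two go one one one two two two is important",
        ]
    ).pipe(hero.tokenize)

    # PATCH 10: term_frequency is count followed by l1 normalization,
    # so every row of the document-term matrix sums to 1.
    counts = hero.count(s).to_numpy().astype(float)
    tf = hero.term_frequency(s).to_numpy()
    np.testing.assert_allclose(tf, counts / counts.sum(axis=1, keepdims=True))

    # PATCHES 11-15 and 21: filter_extremes keeps only the tokens that
    # survive the max_words / min_df / max_df cut, i.e. exactly the
    # columns of the term_frequency document-term frame.
    print(s.pipe(hero.filter_extremes, max_words=3))
    # Expected output (copied from the PATCH 14 doctest):
    # 0              [one, two, one, one, one, go]
    # 1    [two, go, one, one, one, two, two, two]
    # dtype: object

Routing term_frequency through count plus normalize, rather than a second CountVectorizer pass, keeps all vectorizers on one code path; that is also why filter_extremes can reuse term_frequency's surviving vocabulary for its cut.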