Change representation_series to DataFrame #156

Merged

Changes from 10 commits
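For readers skimming the diff: the core change is the return type of texthero's vectorization functions (`count`, `term_frequency`, `tfidf`). A minimal sketch of the before/after shapes, with made-up values — the real fixtures appear in the tests below:

```python
import pandas as pd

# Before this PR (sketch): a sparse "representation series" with one
# entry per (document, term) pair in a MultiIndex.
s_before = pd.Series(
    [1, 2, 1],
    index=pd.MultiIndex.from_tuples([(0, "!"), (0, "Test"), (1, "Test")]),
).astype("Sparse[int64, 0]")

# After this PR (sketch): a sparse "DocumentTermDF" with one row per
# document and a (function name, term) column MultiIndex.
df_after = pd.DataFrame(
    [[1, 2], [0, 1]],
    index=pd.Index([0, 1]),
    columns=pd.MultiIndex.from_product([["count"], ["!", "Test"]]),
).astype("Sparse[int64, 0]")
```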
18 changes: 3 additions & 15 deletions tests/test_indexes.py
@@ -56,21 +56,9 @@
]

test_cases_representation = [
[
"count",
lambda x: representation.flatten(representation.count(x)),
(s_tokenized_lists,),
],
[
"term_frequency",
lambda x: representation.flatten(representation.term_frequency(x)),
(s_tokenized_lists,),
],
[
"tfidf",
lambda x: representation.flatten(representation.tfidf(x)),
(s_tokenized_lists,),
],
["count", representation.count, (s_tokenized_lists,),],
["term_frequency", representation.term_frequency, (s_tokenized_lists,),],
["tfidf", representation.tfidf, (s_tokenized_lists,),],
["pca", representation.pca, (s_numeric_lists, 0)],
["nmf", representation.nmf, (s_numeric_lists,)],
["tsne", representation.tsne, (s_numeric_lists,)],
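The dropped lambdas above mirror the new return type: the old tests piped each vectorizer through `representation.flatten` to get one list per document, while the DataFrame output is already row-per-document. A sketch of the two call patterns (the `s_tokenized_lists` value is a stand-in for the fixture defined elsewhere in this file, and the `flatten` call is only meaningful against the pre-PR API):

```python
import pandas as pd
from texthero import representation

# Stand-in for the s_tokenized_lists fixture defined elsewhere in
# tests/test_indexes.py.
s_tokenized_lists = pd.Series([["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]])

# Pre-PR pattern (sketch): count returned a (document, term)-indexed
# sparse Series, so tests flattened it to one row per document.
old_result = representation.flatten(representation.count(s_tokenized_lists))

# Post-PR pattern (sketch): count returns a sparse DataFrame that is
# already one row per document, so it can be passed through directly.
new_result = representation.count(s_tokenized_lists)
```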
279 changes: 165 additions & 114 deletions tests/test_representation.py
@@ -50,39 +50,129 @@ def _tfidf(term, corpus, document_index):
[["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7]
)

s_tokenized_output_index = pd.MultiIndex.from_tuples(
[(0, "!"), (0, "TEST"), (0, "Test"), (1, "."), (1, "?"), (1, "Test")],
)
s_tokenized_output_index = pd.Index([0, 1])

s_tokenized_output_noncontinuous_index = pd.MultiIndex.from_tuples(
[(5, "!"), (5, "TEST"), (5, "Test"), (7, "."), (7, "?"), (7, "Test")],
)
s_tokenized_output_index_noncontinous = pd.Index([5, 7])

s_tokenized_output_min_df_index = pd.MultiIndex.from_tuples([(0, "Test"), (1, "Test")],)

def _get_multiindex_for_tokenized_output(first_level_name):
return pd.MultiIndex.from_product(
[[first_level_name], ["!", ".", "?", "TEST", "Test"]]
)


test_cases_vectorization = [
# format: [function_name, function, correct output for tokenized input above, dtype of output]
["count", representation.count, [1, 1, 2, 2, 1, 1], "int"],
# format: [function_name, function, correct output for tokenized input above]
[
"count",
representation.count,
pd.DataFrame(
[[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]],
index=s_tokenized_output_index,
columns=_get_multiindex_for_tokenized_output("count"),
).astype("Sparse[int64, 0]"),
],
[
"term_frequency",
representation.term_frequency,
[0.125, 0.125, 0.250, 0.250, 0.125, 0.125],
"float",
pd.DataFrame(
[[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]],
index=s_tokenized_output_index,
columns=_get_multiindex_for_tokenized_output("term_frequency"),
dtype="Sparse",
).astype("Sparse[float64, nan]"),
],
[
"tfidf",
representation.tfidf,
[_tfidf(x[1], s_tokenized, x[0]) for x in s_tokenized_output_index],
"float",
pd.DataFrame(
[
[
_tfidf(x, s_tokenized, 0) # Testing the tfidf formula here
for x in ["!", ".", "?", "TEST", "Test"]
],
[_tfidf(x, s_tokenized, 1) for x in ["!", ".", "?", "TEST", "Test"]],
],
index=s_tokenized_output_index,
columns=_get_multiindex_for_tokenized_output("tfidf"),
).astype("Sparse[float64, nan]"),
],
]


test_cases_vectorization_min_df = [
# format: [function_name, function, correct output for tokenized input above, dtype of output]
["count", representation.count, [2, 1], "int"],
["term_frequency", representation.term_frequency, [0.666667, 0.333333], "float",],
["tfidf", representation.tfidf, [2.0, 1.0], "float",],
# format: [function_name, function, correct output for tokenized input above]
[
"count",
representation.count,
pd.DataFrame(
[2, 1],
index=s_tokenized_output_index,
columns=pd.MultiIndex.from_tuples([("count", "Test")]),
).astype("Sparse[int64, 0]"),
],
[
"term_frequency",
representation.term_frequency,
pd.DataFrame(
[0.666667, 0.333333],
index=s_tokenized_output_index,
columns=pd.MultiIndex.from_tuples([("term_frequency", "Test")]),
).astype("Sparse[float64, nan]"),
],
[
"tfidf",
representation.tfidf,
pd.DataFrame(
[2, 1],
index=s_tokenized_output_index,
columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]),
).astype("Sparse[float64, nan]"),
],
]


s_vector_series = pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7])
s_documenttermDF = pd.DataFrame(
[[1.0, 0.0], [0.0, 0.0]],
index=[5, 7],
columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]),
).astype("Sparse[float64, nan]")


test_cases_dim_reduction_and_clustering = [
# format: [function_name, function, correct output for s_vector_series and s_documenttermDF input above]
["pca", representation.pca, pd.Series([[-0.5, 0.0], [0.5, 0.0]], index=[5, 7],),],
[
"nmf",
representation.nmf,
pd.Series([[5.119042424626627, 0.0], [0.0, 0.0]], index=[5, 7],),
],
[
"tsne",
representation.tsne,
pd.Series([[164.86682, 1814.1647], [-164.8667, -1814.1644]], index=[5, 7],),
],
[
"kmeans",
representation.kmeans,
pd.Series([1, 0], index=[5, 7], dtype="category"),
],
[
"dbscan",
representation.dbscan,
pd.Series([-1, -1], index=[5, 7], dtype="category"),
],
[
"meanshift",
representation.meanshift,
pd.Series([0, 1], index=[5, 7], dtype="category"),
],
[
"normalize",
representation.normalize,
pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7],),
],
]


@@ -98,62 +188,25 @@ class AbstractRepresentationTest(PandasTestCase):
"""

@parameterized.expand(test_cases_vectorization)
def test_vectorization_simple(
self, name, test_function, correct_output_values, int_or_float
):
if int_or_float == "int":
s_true = pd.Series(
correct_output_values, index=s_tokenized_output_index, dtype="int"
).astype(pd.SparseDtype(np.int64, 0))
else:
s_true = pd.Series(
correct_output_values, index=s_tokenized_output_index, dtype="float"
).astype(pd.SparseDtype("float", np.nan))
def test_vectorization_simple(self, name, test_function, correct_output):
s_true = correct_output
result_s = test_function(s_tokenized)

pd.testing.assert_series_equal(s_true, result_s)
pd.testing.assert_frame_equal(s_true, result_s, check_dtype=False)

@parameterized.expand(test_cases_vectorization)
def test_vectorization_noncontinuous_index_kept(
self, name, test_function, correct_output_values, int_or_float
self, name, test_function, correct_output=None
):
if int_or_float == "int":
s_true = pd.Series(
correct_output_values,
index=s_tokenized_output_noncontinuous_index,
dtype="int",
).astype(pd.SparseDtype(np.int64, 0))
else:
s_true = pd.Series(
correct_output_values,
index=s_tokenized_output_noncontinuous_index,
dtype="float",
).astype(pd.SparseDtype("float", np.nan))

result_s = test_function(s_tokenized_with_noncontinuous_index)

pd.testing.assert_series_equal(s_true, result_s)
pd.testing.assert_index_equal(
s_tokenized_output_index_noncontinous, result_s.index
)

@parameterized.expand(test_cases_vectorization_min_df)
def test_vectorization_min_df(
self, name, test_function, correct_output_values, int_or_float
):
if int_or_float == "int":
s_true = pd.Series(
correct_output_values,
index=s_tokenized_output_min_df_index,
dtype="int",
).astype(pd.SparseDtype(np.int64, 0))
else:
s_true = pd.Series(
correct_output_values,
index=s_tokenized_output_min_df_index,
dtype="float",
).astype(pd.SparseDtype("float", np.nan))

def test_vectorization_min_df(self, name, test_function, correct_output):
s_true = correct_output
result_s = test_function(s_tokenized, min_df=2)

pd.testing.assert_series_equal(s_true, result_s)
pd.testing.assert_frame_equal(s_true, result_s, check_dtype=False)

@parameterized.expand(test_cases_vectorization)
def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args):
@@ -168,69 +221,67 @@ def test_vectorization_arguments_to_sklearn(self, name, test_function, *args):
self.fail("Sklearn arguments not handled correctly.")

"""
Individual / special tests.
"""

def test_tfidf_formula(self):
s = pd.Series(["Hi Bye", "Test Bye Bye"])
s = preprocessing.tokenize(s)
s_true_index = pd.MultiIndex.from_tuples(
[(0, "Bye"), (0, "Hi"), (1, "Bye"), (1, "Test")],
)
s_true = pd.Series(
[_tfidf(x[1], s, x[0]) for x in s_true_index], index=s_true_index
).astype("Sparse")

self.assertEqual(representation.tfidf(s), s_true)

"""
flatten.
Dimensionality Reduction and Clustering
"""

def test_flatten(self):
index = pd.MultiIndex.from_tuples(
[("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")],
)
s = pd.Series([3, np.nan, 4], index=index)
@parameterized.expand(test_cases_dim_reduction_and_clustering)
def test_dim_reduction_and_clustering_with_vector_series_input(
self, name, test_function, correct_output
):
s_true = correct_output

s_true = pd.Series(
[[3.0, 0.0, np.nan], [0.0, 4.0, 0.0]], index=["doc0", "doc1"],
)
if name == "kmeans":
result_s = test_function(s_vector_series, random_state=42, n_clusters=2)
elif name == "dbscan" or name == "meanshift" or name == "normalize":
result_s = test_function(s_vector_series)
else:
result_s = test_function(s_vector_series, random_state=42)

pd.testing.assert_series_equal(
representation.flatten(s), s_true, check_names=False
s_true,
result_s,
check_dtype=False,
rtol=0.1,
atol=0.1,
check_category_order=False,
)

def test_flatten_fill_missing_with(self):
index = pd.MultiIndex.from_tuples(
[("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")],
)
s = pd.Series([3, np.nan, 4], index=index)
@parameterized.expand(test_cases_dim_reduction_and_clustering)
def test_dim_reduction_and_clustering_with_documenttermDF_input(
self, name, test_function, correct_output
):
s_true = correct_output

s_true = pd.Series(
[[3.0, "FILLED", np.nan], ["FILLED", 4.0, "FILLED"]],
index=["doc0", "doc1"],
)
if name == "normalize":
# testing this below separately
return

if name == "kmeans":
result_s = test_function(s_documenttermDF, random_state=42, n_clusters=2)
elif name == "dbscan" or name == "meanshift" or name == "normalize":
result_s = test_function(s_documenttermDF)
else:
result_s = test_function(s_documenttermDF, random_state=42)

pd.testing.assert_series_equal(
representation.flatten(s, fill_missing_with="FILLED"),
s_true,
check_names=False,
)

def test_flatten_missing_row(self):
# Simulating a row with no features, so it's completely missing from
# the representation series.
index = pd.MultiIndex.from_tuples(
[("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")],
result_s,
check_dtype=False,
rtol=0.1,
atol=0.1,
check_category_order=False,
)
s = pd.Series([3, np.nan, 4], index=index)

s_true = pd.Series(
[[3.0, 0.0, np.nan], [0.0, 4.0, 0.0], [0.0, 0.0, 0.0]],
index=["doc0", "doc1", "doc2"],
def test_normalize_documenttermDF_also_as_output(self):
# normalize should also return DocumentTermDF output for DocumentTermDF
# input so we test it separately
result = representation.normalize(s_documenttermDF)
correct_output = pd.DataFrame(
[[1.0, 0.0], [0.0, 0.0]],
index=[5, 7],
columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]),
)

pd.testing.assert_series_equal(
representation.flatten(s, index=s_true.index), s_true, check_names=False
pd.testing.assert_frame_equal(
result, correct_output, check_dtype=False, rtol=0.1, atol=0.1,
)
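The `_tfidf(term, corpus, document_index)` helper these expected values rely on is defined above the first hunk, so its body is not visible in this diff. As a reference point, here is a sketch of a helper that reproduces the expected numbers, assuming sklearn's smoothed idf with no length normalization — the file's actual definition may differ:

```python
import numpy as np

def _tfidf(term, corpus, document_index):
    # Smoothed idf as in sklearn's TfidfVectorizer:
    # idf(t) = ln((1 + n_documents) / (1 + df(t))) + 1
    df = sum(term in document for document in corpus)
    idf = np.log((1 + len(corpus)) / (1 + df)) + 1
    # Raw term count times idf, no normalization.
    return corpus[document_index].count(term) * idf
```

With the `s_tokenized` fixture above, `_tfidf("Test", s_tokenized, 0)` evaluates to 2.0 and `_tfidf("Test", s_tokenized, 1)` to 1.0, matching the expected `tfidf` output in the min_df test case.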