Change representation_series to DataFrame #156

Merged

Changes from 10 commits
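For readers skimming the diff: the core change is the return type of texthero's vectorization functions (`count`, `term_frequency`, `tfidf`). A minimal sketch of the before/after shapes, with made-up values — the real fixtures appear in the tests below:

```python
import pandas as pd

# Before this PR (sketch): a sparse "representation series" with one
# entry per (document, term) pair in a MultiIndex.
s_before = pd.Series(
    [1, 2, 1],
    index=pd.MultiIndex.from_tuples([(0, "!"), (0, "Test"), (1, "Test")]),
).astype("Sparse[int64, 0]")

# After this PR (sketch): a sparse "DocumentTermDF" with one row per
# document and a (function name, term) column MultiIndex.
df_after = pd.DataFrame(
    [[1, 2], [0, 1]],
    index=pd.Index([0, 1]),
    columns=pd.MultiIndex.from_product([["count"], ["!", "Test"]]),
).astype("Sparse[int64, 0]")
```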
18 changes: 3 additions & 15 deletions tests/test_indexes.py
@@ -56,21 +56,9 @@
]

test_cases_representation = [
[
"count",
lambda x: representation.flatten(representation.count(x)),
(s_tokenized_lists,),
],
[
"term_frequency",
lambda x: representation.flatten(representation.term_frequency(x)),
(s_tokenized_lists,),
],
[
"tfidf",
lambda x: representation.flatten(representation.tfidf(x)),
(s_tokenized_lists,),
],
["count", representation.count, (s_tokenized_lists,),],
["term_frequency", representation.term_frequency, (s_tokenized_lists,),],
["tfidf", representation.tfidf, (s_tokenized_lists,),],
["pca", representation.pca, (s_numeric_lists, 0)],
["nmf", representation.nmf, (s_numeric_lists,)],
["tsne", representation.tsne, (s_numeric_lists,)],
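The dropped lambdas above mirror the new return type: the old tests piped each vectorizer through `representation.flatten` to get one list per document, while the DataFrame output is already row-per-document. A sketch of the two call patterns (the `s_tokenized_lists` value is a stand-in for the fixture defined elsewhere in this file, and the `flatten` call is only meaningful against the pre-PR API):

```python
import pandas as pd
from texthero import representation

# Stand-in for the s_tokenized_lists fixture defined elsewhere in
# tests/test_indexes.py.
s_tokenized_lists = pd.Series([["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]])

# Pre-PR pattern (sketch): count returned a (document, term)-indexed
# sparse Series, so tests flattened it to one row per document.
old_result = representation.flatten(representation.count(s_tokenized_lists))

# Post-PR pattern (sketch): count returns a sparse DataFrame that is
# already one row per document, so it can be passed through directly.
new_result = representation.count(s_tokenized_lists)
```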
279 changes: 165 additions & 114 deletions tests/test_representation.py
@@ -50,39 +50,129 @@ def _tfidf(term, corpus, document_index):
[["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7]
)

s_tokenized_output_index = pd.MultiIndex.from_tuples(
[(0, "!"), (0, "TEST"), (0, "Test"), (1, "."), (1, "?"), (1, "Test")],
)
s_tokenized_output_index = pd.Index([0, 1])

s_tokenized_output_noncontinuous_index = pd.MultiIndex.from_tuples(
[(5, "!"), (5, "TEST"), (5, "Test"), (7, "."), (7, "?"), (7, "Test")],
)
s_tokenized_output_index_noncontinous = pd.Index([5, 7])

s_tokenized_output_min_df_index = pd.MultiIndex.from_tuples([(0, "Test"), (1, "Test")],)

def _get_multiindex_for_tokenized_output(first_level_name):
return pd.MultiIndex.from_product(
[[first_level_name], ["!", ".", "?", "TEST", "Test"]]
)


test_cases_vectorization = [
# format: [function_name, function, correct output for tokenized input above, dtype of output]
["count", representation.count, [1, 1, 2, 2, 1, 1], "int"],
# format: [function_name, function, correct output for tokenized input above]
[
"count",
representation.count,
pd.DataFrame(
[[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]],
index=s_tokenized_output_index,
columns=_get_multiindex_for_tokenized_output("count"),
).astype("Sparse[int64, 0]"),
],
[
"term_frequency",
representation.term_frequency,
[0.125, 0.125, 0.250, 0.250, 0.125, 0.125],
"float",
pd.DataFrame(
[[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]],
index=s_tokenized_output_index,
columns=_get_multiindex_for_tokenized_output("term_frequency"),
dtype="Sparse",
).astype("Sparse[float64, nan]"),
],
[
"tfidf",
representation.tfidf,
[_tfidf(x[1], s_tokenized, x[0]) for x in s_tokenized_output_index],
"float",
pd.DataFrame(
[
[
_tfidf(x, s_tokenized, 0) # Testing the tfidf formula here
for x in ["!", ".", "?", "TEST", "Test"]
],
[_tfidf(x, s_tokenized, 1) for x in ["!", ".", "?", "TEST", "Test"]],
],
index=s_tokenized_output_index,
columns=_get_multiindex_for_tokenized_output("tfidf"),
).astype("Sparse[float64, nan]"),
],
]


test_cases_vectorization_min_df = [
# format: [function_name, function, correct output for tokenized input above, dtype of output]
["count", representation.count, [2, 1], "int"],
["term_frequency", representation.term_frequency, [0.666667, 0.333333], "float",],
["tfidf", representation.tfidf, [2.0, 1.0], "float",],
# format: [function_name, function, correct output for tokenized input above]
[
"count",
representation.count,
pd.DataFrame(
[2, 1],
index=s_tokenized_output_index,
columns=pd.MultiIndex.from_tuples([("count", "Test")]),
).astype("Sparse[int64, 0]"),
],
[
"term_frequency",
representation.term_frequency,
pd.DataFrame(
[0.666667, 0.333333],
index=s_tokenized_output_index,
columns=pd.MultiIndex.from_tuples([("term_frequency", "Test")]),
).astype("Sparse[float64, nan]"),
],
[
"tfidf",
representation.tfidf,
pd.DataFrame(
[2, 1],
index=s_tokenized_output_index,
columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]),
).astype("Sparse[float64, nan]"),
],
]


s_vector_series = pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7])
s_documenttermDF = pd.DataFrame(
[[1.0, 0.0], [0.0, 0.0]],
index=[5, 7],
columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]),
).astype("Sparse[float64, nan]")


test_cases_dim_reduction_and_clustering = [
# format: [function_name, function, correct output for s_vector_series and s_documenttermDF input above]
["pca", representation.pca, pd.Series([[-0.5, 0.0], [0.5, 0.0]], index=[5, 7],),],
[
"nmf",
representation.nmf,
pd.Series([[5.119042424626627, 0.0], [0.0, 0.0]], index=[5, 7],),
],
[
"tsne",
representation.tsne,
pd.Series([[164.86682, 1814.1647], [-164.8667, -1814.1644]], index=[5, 7],),
],
[
"kmeans",
representation.kmeans,
pd.Series([1, 0], index=[5, 7], dtype="category"),
],
[
"dbscan",
representation.dbscan,
pd.Series([-1, -1], index=[5, 7], dtype="category"),
],
[
"meanshift",
representation.meanshift,
pd.Series([0, 1], index=[5, 7], dtype="category"),
],
[
"normalize",
representation.normalize,
pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7],),
],
]


@@ -98,62 +188,25 @@ class AbstractRepresentationTest(PandasTestCase):
"""

@parameterized.expand(test_cases_vectorization)
def test_vectorization_simple(
self, name, test_function, correct_output_values, int_or_float
):
if int_or_float == "int":
s_true = pd.Series(
correct_output_values, index=s_tokenized_output_index, dtype="int"
).astype(pd.SparseDtype(np.int64, 0))
else:
s_true = pd.Series(
correct_output_values, index=s_tokenized_output_index, dtype="float"
).astype(pd.SparseDtype("float", np.nan))
def test_vectorization_simple(self, name, test_function, correct_output):
s_true = correct_output
result_s = test_function(s_tokenized)

pd.testing.assert_series_equal(s_true, result_s)
pd.testing.assert_frame_equal(s_true, result_s, check_dtype=False)

@parameterized.expand(test_cases_vectorization)
def test_vectorization_noncontinuous_index_kept(
self, name, test_function, correct_output_values, int_or_float
self, name, test_function, correct_output=None
):
if int_or_float == "int":
s_true = pd.Series(
correct_output_values,
index=s_tokenized_output_noncontinuous_index,
dtype="int",
).astype(pd.SparseDtype(np.int64, 0))
else:
s_true = pd.Series(
correct_output_values,
index=s_tokenized_output_noncontinuous_index,
dtype="float",
).astype(pd.SparseDtype("float", np.nan))

result_s = test_function(s_tokenized_with_noncontinuous_index)

pd.testing.assert_series_equal(s_true, result_s)
pd.testing.assert_index_equal(
s_tokenized_output_index_noncontinous, result_s.index
)

@parameterized.expand(test_cases_vectorization_min_df)
def test_vectorization_min_df(
self, name, test_function, correct_output_values, int_or_float
):
if int_or_float == "int":
s_true = pd.Series(
correct_output_values,
index=s_tokenized_output_min_df_index,
dtype="int",
).astype(pd.SparseDtype(np.int64, 0))
else:
s_true = pd.Series(
correct_output_values,
index=s_tokenized_output_min_df_index,
dtype="float",
).astype(pd.SparseDtype("float", np.nan))

def test_vectorization_min_df(self, name, test_function, correct_output):
s_true = correct_output
result_s = test_function(s_tokenized, min_df=2)

pd.testing.assert_series_equal(s_true, result_s)
pd.testing.assert_frame_equal(s_true, result_s, check_dtype=False)

@parameterized.expand(test_cases_vectorization)
def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args):
@@ -168,69 +221,67 @@ def test_vectorization_arguments_to_sklearn(self, name, test_function, *args):
self.fail("Sklearn arguments not handled correctly.")

"""
Individual / special tests.
"""

def test_tfidf_formula(self):
s = pd.Series(["Hi Bye", "Test Bye Bye"])
s = preprocessing.tokenize(s)
s_true_index = pd.MultiIndex.from_tuples(
[(0, "Bye"), (0, "Hi"), (1, "Bye"), (1, "Test")],
)
s_true = pd.Series(
[_tfidf(x[1], s, x[0]) for x in s_true_index], index=s_true_index
).astype("Sparse")

self.assertEqual(representation.tfidf(s), s_true)

"""
flatten.
Dimensionality Reduction and Clustering
"""

def test_flatten(self):
index = pd.MultiIndex.from_tuples(
[("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")],
)
s = pd.Series([3, np.nan, 4], index=index)
@parameterized.expand(test_cases_dim_reduction_and_clustering)
def test_dim_reduction_and_clustering_with_vector_series_input(
self, name, test_function, correct_output
):
s_true = correct_output

s_true = pd.Series(
[[3.0, 0.0, np.nan], [0.0, 4.0, 0.0]], index=["doc0", "doc1"],
)
if name == "kmeans":
result_s = test_function(s_vector_series, random_state=42, n_clusters=2)
elif name == "dbscan" or name == "meanshift" or name == "normalize":
result_s = test_function(s_vector_series)
else:
result_s = test_function(s_vector_series, random_state=42)

pd.testing.assert_series_equal(
representation.flatten(s), s_true, check_names=False
s_true,
result_s,
check_dtype=False,
rtol=0.1,
atol=0.1,
check_category_order=False,
)

def test_flatten_fill_missing_with(self):
index = pd.MultiIndex.from_tuples(
[("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")],
)
s = pd.Series([3, np.nan, 4], index=index)
@parameterized.expand(test_cases_dim_reduction_and_clustering)
def test_dim_reduction_and_clustering_with_documenttermDF_input(
self, name, test_function, correct_output
):
s_true = correct_output

s_true = pd.Series(
[[3.0, "FILLED", np.nan], ["FILLED", 4.0, "FILLED"]],
index=["doc0", "doc1"],
)
if name == "normalize":
# testing this below separately
return

if name == "kmeans":
result_s = test_function(s_documenttermDF, random_state=42, n_clusters=2)
elif name == "dbscan" or name == "meanshift" or name == "normalize":
result_s = test_function(s_documenttermDF)
else:
result_s = test_function(s_documenttermDF, random_state=42)

pd.testing.assert_series_equal(
representation.flatten(s, fill_missing_with="FILLED"),
s_true,
check_names=False,
)

def test_flatten_missing_row(self):
# Simulating a row with no features, so it's completely missing from
# the representation series.
index = pd.MultiIndex.from_tuples(
[("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")],
result_s,
check_dtype=False,
rtol=0.1,
atol=0.1,
check_category_order=False,
)
s = pd.Series([3, np.nan, 4], index=index)

s_true = pd.Series(
[[3.0, 0.0, np.nan], [0.0, 4.0, 0.0], [0.0, 0.0, 0.0]],
index=["doc0", "doc1", "doc2"],
def test_normalize_documenttermDF_also_as_output(self):
# normalize should also return DocumentTermDF output for DocumentTermDF
# input so we test it separately
result = representation.normalize(s_documenttermDF)
correct_output = pd.DataFrame(
[[1.0, 0.0], [0.0, 0.0]],
index=[5, 7],
columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]),
)

pd.testing.assert_series_equal(
representation.flatten(s, index=s_true.index), s_true, check_names=False
pd.testing.assert_frame_equal(
result, correct_output, check_dtype=False, rtol=0.1, atol=0.1,
)
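The `_tfidf(term, corpus, document_index)` helper these expected values rely on is defined above the first hunk, so its body is not visible in this diff. As a reference point, here is a sketch of a helper that reproduces the expected numbers, assuming sklearn's smoothed idf with no length normalization — the file's actual definition may differ:

```python
import numpy as np

def _tfidf(term, corpus, document_index):
    # Smoothed idf as in sklearn's TfidfVectorizer:
    # idf(t) = ln((1 + n_documents) / (1 + df(t))) + 1
    df = sum(term in document for document in corpus)
    idf = np.log((1 + len(corpus)) / (1 + df)) + 1
    # Raw term count times idf, no normalization.
    return corpus[document_index].count(term) * idf
```

With the `s_tokenized` fixture above, `_tfidf("Test", s_tokenized, 0)` evaluates to 2.0 and `_tfidf("Test", s_tokenized, 1)` to 1.0, matching the expected `tfidf` output in the min_df test case.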