Reformatted dataprofiler/tests using black 22.3.0. #521

Merged (2 commits, Jul 11, 2022)
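The diff below is the result of running black 22.3.0 over dataprofiler/tests. Two rules from the black 22.x style account for nearly every hunk: the power operator is written without surrounding spaces when both operands are simple (`1024 ** 3` becomes `1024**3`), and a "magic" trailing comma inside a call forces one argument per line. The PR does not record the exact invocation used; as a minimal sketch, and assuming black is pinned to 22.3.0, the same transformations can be reproduced with black's `format_str` helper:

```python
# Minimal sketch (not part of the PR): reproduce the two black 22.3.0 rules
# visible in this diff. Assumes `pip install black==22.3.0`.
import black

# Rule 1: the power operator hugs simple operands, so the spaces are dropped.
src_pow = "max_allows_file_size = 1024 ** 3  # 1GB\n"
print(black.format_str(src_pow, mode=black.Mode()), end="")
# -> max_allows_file_size = 1024**3  # 1GB

# Rule 2: a trailing ("magic") comma forces the call onto one argument per line.
src_call = 'p = CharPreprocessor(max_length=5, default_label="UNKNOWN", pad_label="PAD",)\n'
print(black.format_str(src_call, mode=black.Mode()), end="")
# Output:
#     p = CharPreprocessor(
#         max_length=5,
#         default_label="UNKNOWN",
#         pad_label="PAD",
#     )
```

In practice the reformat was presumably produced from the command line (something like `black dataprofiler/tests` with that version installed), but the exact command and flags are not shown in the PR.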
4 changes: 2 additions & 2 deletions dataprofiler/tests/data_readers/test_data.py
@@ -78,7 +78,7 @@ def chunk_file(filepath, c_size):
def test_read_url_content_overflow(self, mock_request_get):
# assumed chunk size
c_size = 8192
max_allows_file_size = 1024 ** 3 # 1GB
max_allows_file_size = 1024**3 # 1GB

try:
# mock the iter_content to return just under 1GB so no error raises
@@ -112,7 +112,7 @@ def test_read_url_content_overflow(self, mock_request_get):
def test_read_url_header_overflow(self, mock_request_get):
# assumed chunk size
c_size = 8192
max_allows_file_size = 1024 ** 3 # 1GB
max_allows_file_size = 1024**3 # 1GB

# set valid content length size
content_length = 5000
4 changes: 2 additions & 2 deletions dataprofiler/tests/data_readers/test_data_utils.py
@@ -53,7 +53,7 @@ def test_file_UTF_encoding_detection(self):
def test_nth_loc_detection(self):
"""
Tests the ability for the `data_utils.find_nth_location` to detect the
nth index of a search_query in a string.
nth index of a search_query in a string.
"""
# Input args: string, query, n
# Expected results: index, occurrences
@@ -122,7 +122,7 @@ def test_nth_loc_detection(self):

def test_load_as_str_from_file(self):
"""
Tests if the load_as_str_file function can appropriately load files
Tests if the load_as_str_file function can appropriately load files
thresholded by bytes or max lines.
"""

@@ -24,7 +24,12 @@ def test_classification_report(self):
"f1-score": 1 / 2,
"support": 2,
},
"UNKNOWN": {"precision": 0, "recall": 0, "f1-score": 0, "support": 1,},
"UNKNOWN": {
"precision": 0,
"recall": 0,
"f1-score": 0,
"support": 1,
},
"OTHER": {
"precision": 2 / 3,
"recall": 2 / 3,
21 changes: 16 additions & 5 deletions dataprofiler/tests/labelers/test_data_processing.py
@@ -634,7 +634,9 @@ def test_process_batch_helper(self):

def test_process(self):
preprocessor = CharPreprocessor(
max_length=5, default_label="UNKNOWN", pad_label="PAD",
max_length=5,
default_label="UNKNOWN",
pad_label="PAD",
)

label_mapping = {
@@ -1769,7 +1771,10 @@ def test_flatten_convert(self):
)
postprocessor = CharPostprocessor(flatten_separator=flatten_separator)

output_generator = preprocessor.process(test_sentences, batch_size=batch_size,)
output_generator = preprocessor.process(
test_sentences,
batch_size=batch_size,
)

# mimic model output as a sentence instead of prediction
output = [
@@ -1832,7 +1837,9 @@ def test_flatten_convert(self):
list(data[0]) for batch_data in output_generator for data in batch_data
]
output = postprocessor.process(
test_sentences, dict(pred=output), label_mapping=dict(test=0),
test_sentences,
dict(pred=output),
label_mapping=dict(test=0),
)
reconstructed_test_sentences = [
"".join(sentence) for sentence in output["pred"]
@@ -2065,7 +2072,9 @@ def test_get_parameters(self):

def test_convert_to_unstructured_format(self):
preprocessor = StructCharPreprocessor(
max_length=10, default_label="UNKNOWN", pad_label="PAD",
max_length=10,
default_label="UNKNOWN",
pad_label="PAD",
)

# test a single sentence
@@ -2114,7 +2123,9 @@ def test_convert_to_unstructured_format(self):

def test_process(self):
preprocessor = StructCharPreprocessor(
max_length=10, default_label="UNKNOWN", pad_label="PAD",
max_length=10,
default_label="UNKNOWN",
pad_label="PAD",
)

label_mapping = {
@@ -240,7 +240,10 @@ def test_check_pipeline_overlap_mismatch(self):

# make preprocess and model the same, but different from postprocessor
data_labeler.set_params(
{"model": {"default_label": "a"}, "postprocessor": {"default_label": "b"},}
{
"model": {"default_label": "a"},
"postprocessor": {"default_label": "b"},
}
)
with self.assertRaisesRegex(
RuntimeError,
36 changes: 31 additions & 5 deletions dataprofiler/tests/labelers/test_labeler_utils.py
@@ -28,7 +28,12 @@ def test_no_omit_class(self):
"f1-score": 1 / 2,
"support": 2,
},
"UNKNOWN": {"precision": 0, "recall": 0, "f1-score": 0, "support": 1,},
"UNKNOWN": {
"precision": 0,
"recall": 0,
"f1-score": 0,
"support": 1,
},
"OTHER": {
"precision": 2 / 3,
"recall": 2 / 3,
@@ -65,7 +70,12 @@ def test_omit_1_class(self):
def test_omit_1_class(self):

expected_output = {
"UNKNOWN": {"precision": 0, "recall": 0, "f1-score": 0, "support": 1,},
"UNKNOWN": {
"precision": 0,
"recall": 0,
"f1-score": 0,
"support": 1,
},
"OTHER": {
"precision": 2 / 3,
"recall": 2 / 3,
@@ -153,8 +163,18 @@ def test_no_support_classes(self):
"f1-score": 2 / 3,
"support": 3,
},
"NO_SUPPORT": {"precision": 0, "recall": 0, "f1-score": 0, "support": 0,},
"NO_SUPPORT2": {"precision": 0, "recall": 0, "f1-score": 0, "support": 0,},
"NO_SUPPORT": {
"precision": 0,
"recall": 0,
"f1-score": 0,
"support": 0,
},
"NO_SUPPORT2": {
"precision": 0,
"recall": 0,
"f1-score": 0,
"support": 0,
},
"micro avg": {
"precision": 2 / 3,
"recall": 2 / 3,
@@ -218,7 +238,13 @@ def test_save_conf_mat(self, mock_dataframe):

# ideally mock out the actual contents written to file, but
# would be difficult to get this completely worked out.
expected_conf_mat = np.array([[1, 0, 1], [1, 0, 0], [0, 1, 2],])
expected_conf_mat = np.array(
[
[1, 0, 1],
[1, 0, 0],
[0, 1, 2],
]
)
expected_row_col_names = dict(
columns=["pred:PAD", "pred:UNKNOWN", "pred:OTHER"],
index=["true:PAD", "true:UNKNOWN", "true:OTHER"],
67 changes: 61 additions & 6 deletions dataprofiler/tests/profilers/test_categorical_column_profile.py
@@ -233,12 +233,44 @@ def test_mixed_categorical_col_integer_string(self):
def test_categorical_mapping(self):

df1 = pd.Series(
["abcd", "aa", "abcd", "aa", "b", "4", "3", "2", "dfd", "2", np.nan,]
[
"abcd",
"aa",
"abcd",
"aa",
"b",
"4",
"3",
"2",
"dfd",
"2",
np.nan,
]
)
df2 = pd.Series(
["1", "null", "ee", "NaN", "ff", "nan", "gg", "None", "aa", "b", "ee",]
[
"1",
"null",
"ee",
"NaN",
"ff",
"nan",
"gg",
"None",
"aa",
"b",
"ee",
]
)
df3 = pd.Series(
[
"NaN",
"b",
"nan",
"c",
None,
]
)
df3 = pd.Series(["NaN", "b", "nan", "c", None,])

column_profile = StructuredColProfiler(df1)
cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
@@ -297,7 +329,20 @@ def test_categorical_mapping(self):

def test_true_categorical_report(self):
df_categorical = pd.Series(
["a", "a", "a", "b", "b", "b", "b", "c", "c", "c", "c", "c",]
[
"a",
"a",
"a",
"b",
"b",
"b",
"b",
"c",
"c",
"c",
"c",
"c",
]
)
profile = CategoricalColumn(df_categorical.name)
profile.update(df_categorical)
@@ -338,7 +383,12 @@ def test_false_categorical_report(self):
self.assertIsNotNone(report.pop("times", None))
expected_profile = dict(
categorical=False,
statistics=dict([("unique_count", 20), ("unique_ratio", 1),]),
statistics=dict(
[
("unique_count", 20),
("unique_ratio", 1),
]
),
)
self.assertEqual(report, expected_profile)

@@ -447,7 +497,12 @@ def test_categorical_merge(self):
self.assertIsNotNone(report.pop("times", None))
expected_profile = dict(
categorical=False,
statistics=dict([("unique_count", 16), ("unique_ratio", 16 / 33),]),
statistics=dict(
[
("unique_count", 16),
("unique_ratio", 16 / 33),
]
),
)
self.assertEqual(report, expected_profile)

@@ -225,7 +225,11 @@ def test_profile(self):
min="03/10/13 15:43",
max="Mar 11, 2013",
histogram=None,
format=["%Y-%m-%d %H:%M:%S", "%m/%d/%y %H:%M", "%b %d, %Y",],
format=[
"%Y-%m-%d %H:%M:%S",
"%m/%d/%y %H:%M",
"%b %d, %Y",
],
times=defaultdict(float, {"datetime": 1.0}),
)
time_array = [float(i) for i in range(4, 0, -1)]
14 changes: 9 additions & 5 deletions dataprofiler/tests/profilers/test_float_column_profile.py
@@ -435,7 +435,7 @@ def batch_variance(mean_a, var_a, count_a, mean_b, var_b, count_b):
delta = mean_b - mean_a
m_a = var_a * (count_a - 1)
m_b = var_b * (count_b - 1)
M2 = m_a + m_b + delta ** 2 * count_a * count_b / (count_a + count_b)
M2 = m_a + m_b + delta**2 * count_a * count_b / (count_a + count_b)
return M2 / (count_a + count_b - 1)

data = np.linspace(-5, 5, 11).tolist()
@@ -668,10 +668,10 @@ def test_profiled_histogram(self):
list_data_test.append([df4, expected_histogram4])

# this data has only one unique value, overflow
df5 = pd.Series([-(10.0 ** 20)]).apply(str)
df5 = pd.Series([-(10.0**20)]).apply(str)
expected_histogram5 = {
"bin_counts": np.array([1]),
"bin_edges": np.array([-(10.0 ** 20), -(10.0 ** 20)]),
"bin_edges": np.array([-(10.0**20), -(10.0**20)]),
}
list_data_test.append([df5, expected_histogram5])

@@ -776,7 +776,7 @@ def test_histogram_with_varying_number_of_bin(self):
# this data uses large number of bins, which will be set to
# the max limit
df2 = pd.Series(
[3.195103249264023e18, 9999995.0, 9999999.0, 0.0, -(10 ** 10)]
[3.195103249264023e18, 9999995.0, 9999999.0, 0.0, -(10**10)]
).apply(str)
profiler2 = FloatColumn(df2.name)
profiler2.max_histogram_bin = 50
@@ -1063,7 +1063,11 @@ def test_profile(self):
"bin_counts": np.array([1, 1, 0, 1]),
"bin_edges": np.array([2.5, 5.0, 7.5, 10.0, 12.5]),
},
quantiles={0: 2.5075, 1: 5.005, 2: 12.4925,},
quantiles={
0: 2.5075,
1: 5.005,
2: 12.4925,
},
times=defaultdict(
float,
{
8 changes: 6 additions & 2 deletions dataprofiler/tests/profilers/test_int_column_profile.py
@@ -280,7 +280,7 @@ def batch_variance(mean_a, var_a, count_a, mean_b, var_b, count_b):
delta = mean_b - mean_a
m_a = var_a * (count_a - 1)
m_b = var_b * (count_b - 1)
M2 = m_a + m_b + delta ** 2 * count_a * count_b / (count_a + count_b)
M2 = m_a + m_b + delta**2 * count_a * count_b / (count_a + count_b)
return M2 / (count_a + count_b - 1)

data = np.linspace(-5, 5, 11).tolist()
@@ -542,7 +542,11 @@ def test_profile(self):
"bin_counts": np.array([1, 0, 1]),
"bin_edges": np.array([2.0, 10.0 / 3.0, 14.0 / 3.0, 6.0]),
},
quantiles={0: 2.002, 1: 4, 2: 5.998,},
quantiles={
0: 2.002,
1: 4,
2: 5.998,
},
times=defaultdict(
float,
{
@@ -660,7 +660,11 @@ def test_profile(self):
"bin_counts": np.array([1, 1, 1]),
"bin_edges": np.array([1.0, 2.0, 3.0, 4.0]),
},
quantiles={0: 2.0, 1: 3.0, 2: 4.0,},
quantiles={
0: 2.0,
1: 3.0,
2: 4.0,
},
num_zeros=0, # default
num_negatives=0, # default
times=defaultdict(float), # default