From b69d21c72098cb6cce313e4e3344f528271c6af7 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Thu, 29 Sep 2022 15:04:11 -0400 Subject: [PATCH 1/3] fix: buggy test --- dataprofiler/tests/labelers/test_unstructured_data_labeler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dataprofiler/tests/labelers/test_unstructured_data_labeler.py b/dataprofiler/tests/labelers/test_unstructured_data_labeler.py index 4f8643bcc..22f825765 100644 --- a/dataprofiler/tests/labelers/test_unstructured_data_labeler.py +++ b/dataprofiler/tests/labelers/test_unstructured_data_labeler.py @@ -68,6 +68,7 @@ class TestDataLabeler(unittest.TestCase): @staticmethod def _setup_mock_load_model(mock_load_model): model_mock = mock.Mock(spec=CharacterLevelCnnModel) + model_mock.__class__.__name__ = "CharacterLevelCnnModel" model_mock.set_num_labels = mock.Mock() mock_load_model.return_value = model_mock model_mock.requires_zero_mapping = True From 114d52ef3dcef7d42f8d9ab1eac2380b5ac805b4 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Thu, 29 Sep 2022 15:04:35 -0400 Subject: [PATCH 2/3] fix: allow diff to pass format options --- dataprofiler/profilers/profile_builder.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index f34e047ca..701a22391 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -1177,7 +1177,7 @@ def __add__( # type: ignore[override] return merged_profile def diff( # type: ignore[override] - self, other_profile: UnstructuredProfiler, options: Dict = None + self, other_profile: UnstructuredProfiler, options: Optional[Dict] = None ) -> Dict: """ Find difference between 2 unstuctured profiles and return the report. @@ -1189,6 +1189,9 @@ def diff( # type: ignore[override] :return: difference of the profiles :rtype: dict """ + if options is None: + options = {} + report = super().diff(other_profile, options) report["global_stats"].update( @@ -1208,7 +1211,11 @@ def diff( # type: ignore[override] report["data_stats"] = self._profile.diff( other_profile._profile, options=options ) - return _prepare_report(report) + return _prepare_report( + report, + output_format=options.get("output_format", None), + omit_keys=options.get("omit_keys", None), + ) def _update_base_stats(self, base_stats: Dict) -> None: """ @@ -1593,7 +1600,7 @@ def __add__( # type: ignore[override] return merged_profile def diff( # type: ignore[override] - self, other_profile: StructuredProfiler, options: Dict = None + self, other_profile: StructuredProfiler, options: Optional[Dict] = None ) -> Dict: """ Find the difference between 2 Profiles and return the report. @@ -1605,6 +1612,9 @@ def diff( # type: ignore[override] :return: difference of the profiles :rtype: dict """ + if options is None: + options = {} + report = super().diff(other_profile, options) report["global_stats"].update( { @@ -1666,7 +1676,11 @@ def diff( # type: ignore[override] self._profile[i].diff(other_profile._profile[i], options=options) ) - return _prepare_report(report) + return _prepare_report( + report, + output_format=options.get("output_format", None), + omit_keys=options.get("omit_keys", None), + ) @property def _max_col_samples_used(self) -> int: From 0ff7f06e6a8e4a9fe7ad0587c8fc81bdde638e4c Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Thu, 29 Sep 2022 15:09:04 -0400 Subject: [PATCH 3/3] fix: to serialize diff properly --- .../data_readers/structured_mixins.py | 3 +- .../profilers/helpers/report_helpers.py | 47 +++++++++++++++---- .../tests/profilers/test_profile_builder.py | 9 ++++ 3 files changed, 49 insertions(+), 10 deletions(-) diff --git a/dataprofiler/data_readers/structured_mixins.py b/dataprofiler/data_readers/structured_mixins.py index 2a7f9cb1f..cd468dc8c 100644 --- a/dataprofiler/data_readers/structured_mixins.py +++ b/dataprofiler/data_readers/structured_mixins.py @@ -1,6 +1,7 @@ """Contains mixin data class for loading datasets of tye SpreadSheet.""" from logging import Logger from typing import Any, Dict, List, Optional, Union + import pandas as pd from .. import dp_logging @@ -44,7 +45,7 @@ def _load_data_from_file(self, input_file_path: str) -> Any: """Load the data into memory from the file.""" raise NotImplementedError() - def _load_data(self, data: Optional[Union[pd.DataFrame, str]]=None) -> None: + def _load_data(self, data: Optional[Union[pd.DataFrame, str]] = None) -> None: """Load either the specified data or the input_file into memory.""" if data is not None: if isinstance(data, pd.DataFrame): diff --git a/dataprofiler/profilers/helpers/report_helpers.py b/dataprofiler/profilers/helpers/report_helpers.py index 3d5e7d5de..42c4f6d08 100644 --- a/dataprofiler/profilers/helpers/report_helpers.py +++ b/dataprofiler/profilers/helpers/report_helpers.py @@ -63,6 +63,28 @@ def flat_dict(od: Dict, separator: str = "_", key: str = "") -> Dict: ) +def _clean_profile_schema(value: Dict) -> Dict: + """ + Clean schemas in profile. + + Prepares profile schemas specifically which requires the entire list to be + shown and update specific issues related to some np.numbers not being + serializable. + + :param value: dict of the data's profile schema + :type value: dict[str, list[ints]] + :return: cleaned profile schema + :rtype: dict[str, list[ints]] + """ + profile_schema_keys = list(value.keys()) + for i, col_name in enumerate(profile_schema_keys): + if isinstance(col_name, np.int64): + profile_schema_keys[i] = int(col_name) + elif not isinstance(col_name, (str, int, float, bool, type(None))): + profile_schema_keys[i] = str(col_name) + return dict(zip(profile_schema_keys, value.values())) + + def _prepare_report( report: Dict, output_format: str = None, omit_keys: List[str] = None ) -> Dict: @@ -181,13 +203,13 @@ def _prepare_report( # Do not recurse or modify profile_schema elif key == "profile_schema" and "profile_schema" not in omit_keys: if output_format in ["serializable", "pretty", "compact"]: - profile_schema_keys = list(value.keys()) - for i, col_name in enumerate(profile_schema_keys): - if isinstance(col_name, np.int64): - profile_schema_keys[i] = int(col_name) - elif not isinstance(col_name, (str, int, float, bool, type(None))): - profile_schema_keys[i] = str(col_name) - value = dict(zip(profile_schema_keys, value.values())) + if isinstance(value, list): + # for diff specifically + value = [_clean_profile_schema(schema) for schema in value] + else: + # for report + value = _clean_profile_schema(value) + fmt_report[key] = value elif isinstance(value, dict): @@ -211,11 +233,11 @@ def _prepare_report( value, output_format, next_layer_omit_keys ) - elif isinstance(value, list) or isinstance(value, np.ndarray): + elif isinstance(value, (list, np.ndarray, set)): if output_format == "pretty": - if isinstance(value, list): + if isinstance(value, (set, list)): value = np.array(value) str_value = np.array2string(value, separator=", ") @@ -238,6 +260,13 @@ def _prepare_report( else: fmt_report[key] = value + if output_format in ["pretty", "serializable", "compact"] and all( + isinstance(v, dict) for v in value + ): + fmt_report[key] = [ + _prepare_report(v, output_format, omit_keys) for v in value + ] + elif isinstance(value, float) and output_format == "pretty": fmt_report[key] = round(value, 4) else: diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index e3ca3c39e..9a47af639 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -1,5 +1,6 @@ from __future__ import print_function +import json import logging import os import random @@ -1847,6 +1848,10 @@ def test_diff(self, *mocks): np.testing.assert_array_almost_equal(expected_chi2_mat, diff_chi2_mat) self.assertDictEqual(expected_diff, diff) + diff = profile1.diff(profile2, options={"output_format": "serializable"}) + # validate can serialize + json.dumps(diff) + @mock.patch("dataprofiler.profilers.profile_builder.DataLabeler") @mock.patch( "dataprofiler.profilers.data_labeler_column_profile." "DataLabelerColumn.update" @@ -2694,6 +2699,10 @@ def test_diff(self, *mocks): } self.assertDictEqual(expected_diff, profiler1.diff(profiler2)) + # validate can serialize + diff = profiler1.diff(profiler2, options=dict(output_format="serializable")) + json.dumps(diff) + def test_get_sample_size(self, *mocks): data = pd.DataFrame([0] * int(50e3))