Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow diff to set format options for prepare report #669

Merged
merged 3 commits into from
Sep 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion dataprofiler/data_readers/structured_mixins.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Contains mixin data class for loading datasets of tye SpreadSheet."""
from logging import Logger
from typing import Any, Dict, List, Optional, Union

import pandas as pd

from .. import dp_logging
Expand Down Expand Up @@ -44,7 +45,7 @@ def _load_data_from_file(self, input_file_path: str) -> Any:
"""Load the data into memory from the file."""
raise NotImplementedError()

def _load_data(self, data: Optional[Union[pd.DataFrame, str]]=None) -> None:
def _load_data(self, data: Optional[Union[pd.DataFrame, str]] = None) -> None:
"""Load either the specified data or the input_file into memory."""
if data is not None:
if isinstance(data, pd.DataFrame):
Expand Down
47 changes: 38 additions & 9 deletions dataprofiler/profilers/helpers/report_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,28 @@ def flat_dict(od: Dict, separator: str = "_", key: str = "") -> Dict:
)


def _clean_profile_schema(value: Dict) -> Dict:
"""
Clean schemas in profile.

Prepares profile schemas specifically which requires the entire list to be
shown and update specific issues related to some np.numbers not being
serializable.

:param value: dict of the data's profile schema
:type value: dict[str, list[ints]]
:return: cleaned profile schema
:rtype: dict[str, list[ints]]
"""
profile_schema_keys = list(value.keys())
for i, col_name in enumerate(profile_schema_keys):
if isinstance(col_name, np.int64):
profile_schema_keys[i] = int(col_name)
elif not isinstance(col_name, (str, int, float, bool, type(None))):
profile_schema_keys[i] = str(col_name)
return dict(zip(profile_schema_keys, value.values()))


def _prepare_report(
report: Dict, output_format: str = None, omit_keys: List[str] = None
) -> Dict:
Expand Down Expand Up @@ -181,13 +203,13 @@ def _prepare_report(
# Do not recurse or modify profile_schema
elif key == "profile_schema" and "profile_schema" not in omit_keys:
if output_format in ["serializable", "pretty", "compact"]:
profile_schema_keys = list(value.keys())
for i, col_name in enumerate(profile_schema_keys):
if isinstance(col_name, np.int64):
profile_schema_keys[i] = int(col_name)
elif not isinstance(col_name, (str, int, float, bool, type(None))):
profile_schema_keys[i] = str(col_name)
value = dict(zip(profile_schema_keys, value.values()))
if isinstance(value, list):
# for diff specifically
value = [_clean_profile_schema(schema) for schema in value]
else:
# for report
value = _clean_profile_schema(value)

fmt_report[key] = value

elif isinstance(value, dict):
Expand All @@ -211,11 +233,11 @@ def _prepare_report(
value, output_format, next_layer_omit_keys
)

elif isinstance(value, list) or isinstance(value, np.ndarray):
elif isinstance(value, (list, np.ndarray, set)):

if output_format == "pretty":

if isinstance(value, list):
if isinstance(value, (set, list)):
value = np.array(value)

str_value = np.array2string(value, separator=", ")
Expand All @@ -238,6 +260,13 @@ def _prepare_report(
else:
fmt_report[key] = value

if output_format in ["pretty", "serializable", "compact"] and all(
isinstance(v, dict) for v in value
):
fmt_report[key] = [
_prepare_report(v, output_format, omit_keys) for v in value
]

elif isinstance(value, float) and output_format == "pretty":
fmt_report[key] = round(value, 4)
else:
Expand Down
22 changes: 18 additions & 4 deletions dataprofiler/profilers/profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -1177,7 +1177,7 @@ def __add__( # type: ignore[override]
return merged_profile

def diff( # type: ignore[override]
self, other_profile: UnstructuredProfiler, options: Dict = None
self, other_profile: UnstructuredProfiler, options: Optional[Dict] = None
) -> Dict:
"""
Find difference between 2 unstructured profiles and return the report.
Expand All @@ -1189,6 +1189,9 @@ def diff( # type: ignore[override]
:return: difference of the profiles
:rtype: dict
"""
if options is None:
options = {}

report = super().diff(other_profile, options)

report["global_stats"].update(
Expand All @@ -1208,7 +1211,11 @@ def diff( # type: ignore[override]
report["data_stats"] = self._profile.diff(
other_profile._profile, options=options
)
return _prepare_report(report)
return _prepare_report(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just curious, why do we return options here if they are passed in? I wouldn't have thought to do that.

report,
output_format=options.get("output_format", None),
omit_keys=options.get("omit_keys", None),
)

def _update_base_stats(self, base_stats: Dict) -> None:
"""
Expand Down Expand Up @@ -1593,7 +1600,7 @@ def __add__( # type: ignore[override]
return merged_profile

def diff( # type: ignore[override]
self, other_profile: StructuredProfiler, options: Dict = None
self, other_profile: StructuredProfiler, options: Optional[Dict] = None
) -> Dict:
"""
Find the difference between 2 Profiles and return the report.
Expand All @@ -1605,6 +1612,9 @@ def diff( # type: ignore[override]
:return: difference of the profiles
:rtype: dict
"""
if options is None:
options = {}

report = super().diff(other_profile, options)
report["global_stats"].update(
{
Expand Down Expand Up @@ -1666,7 +1676,11 @@ def diff( # type: ignore[override]
self._profile[i].diff(other_profile._profile[i], options=options)
)

return _prepare_report(report)
return _prepare_report(
report,
output_format=options.get("output_format", None),
omit_keys=options.get("omit_keys", None),
)

@property
def _max_col_samples_used(self) -> int:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ class TestDataLabeler(unittest.TestCase):
@staticmethod
def _setup_mock_load_model(mock_load_model):
model_mock = mock.Mock(spec=CharacterLevelCnnModel)
model_mock.__class__.__name__ = "CharacterLevelCnnModel"
model_mock.set_num_labels = mock.Mock()
mock_load_model.return_value = model_mock
model_mock.requires_zero_mapping = True
Expand Down
9 changes: 9 additions & 0 deletions dataprofiler/tests/profilers/test_profile_builder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import print_function

import json
import logging
import os
import random
Expand Down Expand Up @@ -1847,6 +1848,10 @@ def test_diff(self, *mocks):
np.testing.assert_array_almost_equal(expected_chi2_mat, diff_chi2_mat)
self.assertDictEqual(expected_diff, diff)

diff = profile1.diff(profile2, options={"output_format": "serializable"})
# validate can serialize
json.dumps(diff)

@mock.patch("dataprofiler.profilers.profile_builder.DataLabeler")
@mock.patch(
"dataprofiler.profilers.data_labeler_column_profile." "DataLabelerColumn.update"
Expand Down Expand Up @@ -2694,6 +2699,10 @@ def test_diff(self, *mocks):
}
self.assertDictEqual(expected_diff, profiler1.diff(profiler2))

# validate can serialize
diff = profiler1.diff(profiler2, options=dict(output_format="serializable"))
json.dumps(diff)

def test_get_sample_size(self, *mocks):
data = pd.DataFrame([0] * int(50e3))

Expand Down