Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Top Profiler Structured Diff #358

Merged
merged 11 commits into from
Jul 27, 2021
148 changes: 148 additions & 0 deletions dataprofiler/profilers/profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,69 @@ def __add__(self, other):
)
return merged_profile

def diff(self, other_profile, options=None):
    """
    Finds the difference between 2 StructuredCols and returns the report

    :param other_profile: Structured col finding the difference with this
        one.
    :type other_profile: StructuredColProfiler
    :param options: options to change results of the difference
    :type options: dict
    :return: difference of the structured column, with keys in a fixed
        order; keys missing from the computed diff are set to None
    :rtype: dict
    """
    unordered_profile = dict()
    # Only sub-profiles present on both sides can be compared.
    for key, sub_profile in self.profiles.items():
        if key in other_profile.profiles:
            comp_diff = sub_profile.diff(other_profile.profiles[key],
                                         options=options)
            utils.dict_merge(unordered_profile, comp_diff)

    name = self.name
    if isinstance(name, np.integer):
        # numpy integer names are reported as builtin ints
        name = int(name)
    unordered_profile["column_name"] = name

    # The merged sub-profile diffs are not guaranteed to contain a
    # "statistics" entry (e.g. no sub-profile is shared by both columns),
    # so create it on demand instead of raising a KeyError.
    statistics = unordered_profile.setdefault("statistics", {})
    statistics.update({
        "sample_size": utils.find_diff_of_numbers(
            self.sample_size, other_profile.sample_size),
        "null_count": utils.find_diff_of_numbers(
            self.null_count, other_profile.null_count),
        "null_types": utils.find_diff_of_lists_and_sets(
            self.null_types, other_profile.null_types),
        "null_types_index": utils.find_diff_of_dicts_with_diff_keys(
            self.null_types_index, other_profile.null_types_index),
    })

    if unordered_profile.get("data_type") is not None:
        statistics["data_type_representation"] = \
            unordered_profile["data_type_representation"]

    dict_order = [
        "column_name",
        "data_type",
        "data_label",
        "categorical",
        "order",
        "statistics",
    ]
    # "data_label" is only reported when both columns have a data label
    # profile.
    if 'data_label_profile' not in self.profiles or \
            'data_label_profile' not in other_profile.profiles:
        dict_order.remove("data_label")

    # Emit the keys in the fixed report order; absent keys become None.
    return OrderedDict(
        (key, unordered_profile.get(key)) for key in dict_order)

@property
def profile(self):
unordered_profile = dict()
Expand Down Expand Up @@ -556,6 +619,30 @@ def __add__(self, other):

return merged_profile

def diff(self, other_profile, options=None):
    """
    Builds the global-stats part of the diff between this profile and
    another profile of exactly the same type

    :param other_profile: profile to compare against this one
    :type other_profile: BaseProfiler
    :param options: not used by this method
    :type options: dict
    :raises TypeError: if the two profiles are not of the same type
    :return: diff of the two profiles, holding a single "global_stats"
        entry with the "file_type" and "encoding" differences
    :rtype: dict
    """
    own_type = type(self)
    other_type = type(other_profile)
    # Exact type match is required; a subclass instance is rejected too.
    if other_type is not own_type:
        raise TypeError('`{}` and `{}` are not of the same profiler type.'.
                        format(own_type.__name__, other_type.__name__))

    global_stats = {
        "file_type": utils.find_diff_of_strings_and_bools(
            self.file_type, other_profile.file_type),
        "encoding": utils.find_diff_of_strings_and_bools(
            self.encoding, other_profile.encoding),
    }

    diff_profile = OrderedDict()
    diff_profile["global_stats"] = global_stats
    return diff_profile

def _get_sample_size(self, data):
"""
Determines the minimum sampling size for profiling the dataset.
Expand Down Expand Up @@ -1241,6 +1328,67 @@ def __add__(self, other):

return merged_profile

def diff(self, other_profile, options=None):
    """
    Finds the difference between 2 Profiles and returns the report

    :param other_profile: profile finding the difference with this one
    :type other_profile: StructuredProfiler
    :param options: options to change results of the difference
    :type options: dict
    :return: difference of the profiles
    :rtype: dict
    """
    report = super().diff(other_profile, options)
    report["global_stats"].update({
        "samples_used": utils.find_diff_of_numbers(
            self._max_col_samples_used,
            other_profile._max_col_samples_used),
        "column_count": utils.find_diff_of_numbers(
            len(self._profile), len(other_profile._profile)),
        "row_count": utils.find_diff_of_numbers(
            self.total_samples, other_profile.total_samples),
        "row_has_null_ratio": utils.find_diff_of_numbers(
            self._get_row_has_null_ratio(),
            other_profile._get_row_has_null_ratio()),
        "row_is_null_ratio": utils.find_diff_of_numbers(
            self._get_row_is_null_ratio(),
            other_profile._get_row_is_null_ratio()),
        "unique_row_ratio": utils.find_diff_of_numbers(
            self._get_unique_row_ratio(),
            other_profile._get_unique_row_ratio()),
        # Compare like with like: duplicate count against duplicate count
        # (previously compared against the other profile's null ratio).
        "duplicate_row_count": utils.find_diff_of_numbers(
            self._get_duplicate_row_count(),
            other_profile._get_duplicate_row_count()),
        "correlation_matrix": utils.find_diff_of_matrices(
            self.correlation_matrix,
            other_profile.correlation_matrix),
        # Placeholder; replaced below once both schemas are extracted.
        "profile_schema": defaultdict(list)})
    report.update({"data_stats": []})

    # Extract the schema of each profile: column name -> list of the
    # positions at which a column with that name appears.
    # NOTE(review): how often are we doing this? should we have a util
    # function to build this dict?
    self_profile_schema = defaultdict(list)
    other_profile_schema = defaultdict(list)
    for index, col_profile in enumerate(self._profile):
        self_profile_schema[col_profile.name].append(index)
    for index, col_profile in enumerate(other_profile._profile):
        other_profile_schema[col_profile.name].append(index)

    report["global_stats"]["profile_schema"] = \
        utils.find_diff_of_dicts_with_diff_keys(self_profile_schema,
                                                other_profile_schema)

    # Only find the diff of columns if the schemas are exactly the same;
    # equal schemas imply both profiles hold the same number of columns,
    # so the columns can be paired by position.
    if self_profile_schema == other_profile_schema:
        for own_col, other_col in zip(self._profile,
                                      other_profile._profile):
            report["data_stats"].append(
                own_col.diff(other_col, options=options))

    return _prepare_report(report)

@property
def _max_col_samples_used(self):
"""
Expand Down
25 changes: 25 additions & 0 deletions dataprofiler/profilers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,31 @@ def find_diff_of_dicts(dict1, dict2):
return diff


def find_diff_of_matrices(matrix1, matrix2):
    """
    Finds the difference between two matrices.

    :param matrix1: the first matrix
    :type matrix1: list(list(float))
    :param matrix2: the second matrix
    :type matrix2: list(list(float))
    :return: element-wise difference ``matrix1 - matrix2``; the string
        "unchanged" if every element of the difference is 0 or NaN; None
        if either matrix is None or the shapes differ
    :rtype: Union[numpy.ndarray, str, None]
    """
    if matrix1 is None or matrix2 is None:
        return None

    # Builtin float replaces the `np.float` alias, which was removed from
    # NumPy and raises AttributeError on current releases.
    mat1 = np.array(matrix1, dtype=float)
    mat2 = np.array(matrix2, dtype=float)

    if mat1.shape != mat2.shape:
        return None

    diff = mat1 - mat2
    # NaN entries in the difference are treated the same as zeros.
    if ((diff == 0) | np.isnan(diff)).all():
        return "unchanged"

    return diff

def find_diff_of_dicts_with_diff_keys(dict1, dict2):
"""
Finds the difference between two dicts. For each key in each dict,
Expand Down
Loading