Skip to content

Commit

Permalink
Top Profiler Structured Diff (#358)
Browse files Browse the repository at this point in the history
* Profile diff

* Updated

* Undid small change

* new line

* Abstracted a bit
  • Loading branch information
grant-eden authored Jul 27, 2021
1 parent ab64055 commit 7d5a0b4
Show file tree
Hide file tree
Showing 4 changed files with 439 additions and 1 deletion.
148 changes: 148 additions & 0 deletions dataprofiler/profilers/profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,69 @@ def __add__(self, other):
)
return merged_profile

def diff(self, other_profile, options=None):
    """
    Finds the difference between 2 StructuredCols and returns the report

    :param other_profile: Structured col finding the difference with this
        one.
    :type other_profile: StructuredColProfiler
    :param options: options to change results of the difference
    :type options: dict
    :return: difference of the structured column
    :rtype: dict
    """
    # Merge the diffs of every sub-profile that exists on both columns;
    # sub-profiles present on only one side are ignored.
    unordered_profile = dict()
    for key, sub_profile in self.profiles.items():
        if key in other_profile.profiles:
            comp_diff = sub_profile.diff(other_profile.profiles[key],
                                         options=options)
            utils.dict_merge(unordered_profile, comp_diff)

    # Report numpy integer column names as plain Python ints.
    name = self.name
    if isinstance(name, np.integer):
        name = int(name)
    unordered_profile["column_name"] = name

    # The merged sub-profile diffs may not have produced a "statistics"
    # entry (e.g. the two columns share no sub-profiles), so create it on
    # demand instead of failing with a KeyError.
    statistics = unordered_profile.setdefault("statistics", {})
    statistics.update({
        "sample_size": utils.find_diff_of_numbers(
            self.sample_size, other_profile.sample_size),
        "null_count": utils.find_diff_of_numbers(
            self.null_count, other_profile.null_count),
        "null_types": utils.find_diff_of_lists_and_sets(
            self.null_types, other_profile.null_types),
        "null_types_index": utils.find_diff_of_dicts_with_diff_keys(
            self.null_types_index, other_profile.null_types_index),
    })

    # Mirror the data type representation into the statistics section
    # whenever a data type diff was produced.
    if unordered_profile.get("data_type", None) is not None:
        statistics.update({
            "data_type_representation":
                unordered_profile["data_type_representation"]
        })

    dict_order = [
        "column_name",
        "data_type",
        "data_label",
        "categorical",
        "order",
        "statistics",
    ]
    # The data label entry is only reported when both columns were
    # profiled with a data labeler.
    if 'data_label_profile' not in self.profiles or \
            'data_label_profile' not in other_profile.profiles:
        dict_order.remove("data_label")

    # Emit the keys in a fixed order; keys nothing contributed are None.
    profile = OrderedDict()
    for key in dict_order:
        profile[key] = unordered_profile.get(key)

    return profile

@property
def profile(self):
unordered_profile = dict()
Expand Down Expand Up @@ -556,6 +619,30 @@ def __add__(self, other):

return merged_profile

def diff(self, other_profile, options=None):
    """
    Builds the part of the diff report common to all profilers.

    :param other_profile: profile compared against this one.
    :type other_profile: BaseProfiler
    :param options: options for the difference; not read by this method
    :type options: dict
    :return: diff of the two profiles, holding a "global_stats" entry
    :rtype: dict
    :raises TypeError: when the two profiles are not of the exact same type
    """
    own_type = type(self)
    other_type = type(other_profile)
    if own_type is not other_type:
        message = '`{}` and `{}` are not of the same profiler type.'
        raise TypeError(
            message.format(own_type.__name__, other_type.__name__))

    global_stats = {
        "file_type": utils.find_diff_of_strings_and_bools(
            self.file_type, other_profile.file_type),
        "encoding": utils.find_diff_of_strings_and_bools(
            self.encoding, other_profile.encoding),
    }

    diff_profile = OrderedDict()
    diff_profile["global_stats"] = global_stats
    return diff_profile

def _get_sample_size(self, data):
"""
Determines the minimum sampling size for profiling the dataset.
Expand Down Expand Up @@ -1241,6 +1328,67 @@ def __add__(self, other):

return merged_profile

def diff(self, other_profile, options=None):
    """
    Finds the difference between 2 Profiles and returns the report

    :param other_profile: profile finding the difference with this one
    :type other_profile: StructuredProfiler
    :param options: options to change results of the difference
    :type options: dict
    :return: difference of the profiles
    :rtype: dict
    """
    # The parent diff validates the profiler types and seeds
    # "global_stats" with the file type and encoding differences.
    report = super().diff(other_profile, options)
    report["global_stats"].update({
        "samples_used": utils.find_diff_of_numbers(
            self._max_col_samples_used,
            other_profile._max_col_samples_used),
        "column_count": utils.find_diff_of_numbers(
            len(self._profile), len(other_profile._profile)),
        "row_count": utils.find_diff_of_numbers(
            self.total_samples, other_profile.total_samples),
        "row_has_null_ratio": utils.find_diff_of_numbers(
            self._get_row_has_null_ratio(),
            other_profile._get_row_has_null_ratio()),
        "row_is_null_ratio": utils.find_diff_of_numbers(
            self._get_row_is_null_ratio(),
            other_profile._get_row_is_null_ratio()),
        "unique_row_ratio": utils.find_diff_of_numbers(
            self._get_unique_row_ratio(),
            other_profile._get_unique_row_ratio()),
        # Compare like with like: both sides use the duplicate row count.
        "duplicate_row_count": utils.find_diff_of_numbers(
            self._get_duplicate_row_count(),
            other_profile._get_duplicate_row_count()),
        "correlation_matrix": utils.find_diff_of_matrices(
            self.correlation_matrix,
            other_profile.correlation_matrix),
        # Placeholder that fixes the key's position in the report; the
        # real schema diff is filled in below.
        "profile_schema": defaultdict(list)})
    report.update({"data_stats": []})

    # Extract the schema of each profile: column name -> list of the
    # positions at which a column with that name appears.
    self_profile_schema = defaultdict(list)
    other_profile_schema = defaultdict(list)
    for index, column in enumerate(self._profile):
        self_profile_schema[column.name].append(index)
    for index, column in enumerate(other_profile._profile):
        other_profile_schema[column.name].append(index)

    report["global_stats"]["profile_schema"] = \
        utils.find_diff_of_dicts_with_diff_keys(self_profile_schema,
                                                other_profile_schema)

    # Only find the diff of columns if the schemas are exactly the same;
    # identical schemas imply the same columns at the same positions, so
    # the two column lists can be walked in lockstep.
    if self_profile_schema == other_profile_schema:
        for column, other_column in zip(self._profile,
                                        other_profile._profile):
            report["data_stats"].append(
                column.diff(other_column, options=options))

    return _prepare_report(report)

@property
def _max_col_samples_used(self):
"""
Expand Down
25 changes: 25 additions & 0 deletions dataprofiler/profilers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,31 @@ def find_diff_of_dicts(dict1, dict2):
return diff


def find_diff_of_matrices(matrix1, matrix2):
    """
    Finds the difference between two matrices.

    :param matrix1: the first matrix
    :type matrix1: list(list(float))
    :param matrix2: the second matrix
    :type matrix2: list(list(float))
    :return: element-wise difference (matrix1 - matrix2), the string
        "unchanged" when every element of the difference is 0 or NaN, or
        None when either matrix is None or the shapes differ
    :rtype: Union[np.ndarray, str, None]
    """
    diff = None

    if matrix1 is not None and matrix2 is not None:
        # np.float64 replaces the `np.float` alias, which was deprecated
        # in NumPy 1.20 and removed in NumPy 1.24.
        mat1 = np.array(matrix1, dtype=np.float64)
        mat2 = np.array(matrix2, dtype=np.float64)

        # Matrices of different shapes are not comparable: leave diff None.
        if mat1.shape == mat2.shape:
            diff = mat1 - mat2
            # NaN cells are treated the same as zero differences when
            # deciding whether anything changed.
            if ((diff == 0) | np.isnan(diff)).all():
                diff = "unchanged"

    return diff

def find_diff_of_dicts_with_diff_keys(dict1, dict2):
"""
Finds the difference between two dicts. For each key in each dict,
Expand Down
Loading

0 comments on commit 7d5a0b4

Please sign in to comment.