Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add text size to unstructured profiler #340

Merged
merged 18 commits into from
Jul 19, 2021
27 changes: 19 additions & 8 deletions dataprofiler/profilers/profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -884,6 +884,7 @@ def __init__(self, data, samples_per_update=None, min_true_samples=0,

# Unstructured specific properties
self._empty_line_count = 0
self.capacity = 0
self.sample = []

if data is not None:
Expand All @@ -910,6 +911,7 @@ def __add__(self, other):
# unstruct specific property merging
merged_profile._empty_line_count = (
self._empty_line_count + other._empty_line_count)
merged_profile.capacity = self.capacity + other.capacity
samples = list(dict.fromkeys(self.sample + other.sample))
merged_profile.sample = random.sample(list(samples),
min(len(samples), 5))
Expand All @@ -931,6 +933,7 @@ def _update_base_stats(self, base_stats):
self.total_samples += base_stats["sample_size"]
self.sample = base_stats["sample"]
self._empty_line_count += base_stats["empty_line_count"]
self.capacity += base_stats["capacity"]

def report(self, report_options=None):
"""
Expand Down Expand Up @@ -965,7 +968,8 @@ def report(self, report_options=None):
"samples_used": self.total_samples,
"empty_line_count": self._empty_line_count,
"file_type": self.file_type,
"encoding": self.encoding
"encoding": self.encoding,
"capacity": self.capacity,
}),
("data_stats", OrderedDict()),
])
Expand Down Expand Up @@ -997,12 +1001,16 @@ def _clean_data_and_get_base_stats(data, sample_size,
len_data = len(data)
if not len_data:
return data, {
"sample_size": 0, "empty_line_count": dict(), "sample": [],
"sample_size": 0, "empty_line_count": dict(),
"sample": [], "capacity": 0
}

# ensure all data are of type str
data = data.apply(str)

# get capacity
base_stats = {"capacity": utils.get_capacity(data)}

# Setup sample generator
sample_ind_generator = utils.shuffle_in_chunks(
len_data, chunk_size=sample_size)
Expand Down Expand Up @@ -1038,12 +1046,14 @@ def _clean_data_and_get_base_stats(data, sample_size,
data = data.loc[true_sample_list]
total_empty = total_sample_size - len(true_sample_list)

base_stats = {
"sample_size": total_sample_size,
"empty_line_count": total_empty,
"sample": random.sample(list(data.values),
min(len(data), 5)),
}
base_stats.update(
{
"sample_size": total_sample_size,
"empty_line_count": total_empty,
"sample": random.sample(list(data.values),
min(len(data), 5)),
}
)

return data, base_stats

Expand Down Expand Up @@ -1114,6 +1124,7 @@ def save(self, filepath=None):
"_samples_per_update": self._samples_per_update,
"_min_true_samples": self._min_true_samples,
"_empty_line_count": self._empty_line_count,
"capacity": self.capacity,
"options": self.options,
"_profile": self.profile
}
Expand Down
20 changes: 20 additions & 0 deletions dataprofiler/profilers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,3 +483,23 @@ def find_diff_of_dicts(dict1, dict2):
diff = "unchanged"

return diff

def get_capacity(data, unit='M'):
    """
    Get the size of the input data in the requested unit.

    The size is the total number of bytes taken by every item of ``data``
    once encoded as UTF-8, divided by 1024 raised to the power associated
    with ``unit`` (B -> 0, K -> 1, M -> 2, G -> 3).

    Every item must be a string, since each one is encoded individually.
    Note that iterating a pandas.DataFrame yields its column labels rather
    than its cell contents, so pass a list, array, or Series of strings.

    :param data: iterable of strings to measure
    :type data: Union[list, numpy.array, pandas.Series]
    :param unit: size unit (B, K, M, or G)
    :type unit: string
    :return: size of the input data expressed in ``unit``
    :rtype: float
    :raises ValueError: if ``unit`` is not one of B, K, M, or G
    """
    unit_map = {'B': 0, 'K': 1, 'M': 2, 'G': 3}
    if unit not in unit_map:
        raise ValueError('Currently only supports the '
                         'capacity unit in [B, K, M, G]')
    # Count encoded bytes, not characters, so multi-byte code points are
    # measured by the space they actually occupy.
    total_bytes = sum(len(sentence.encode('utf-8')) for sentence in data)
    # 1024.0 keeps the division in floating point; for 'B' the divisor is 1.
    return total_bytes / 1024.0 ** unit_map[unit]
26 changes: 24 additions & 2 deletions dataprofiler/tests/profilers/test_profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -1399,6 +1399,7 @@ def test_base(self, *mocks):
self.assertEqual(0, profiler._min_true_samples)
self.assertEqual(0, profiler.total_samples)
self.assertEqual(0, profiler._empty_line_count)
self.assertEqual(0, profiler.capacity)
self.assertEqual(0.2, profiler._sampling_ratio)
self.assertEqual(5000, profiler._min_sample_size)
self.assertEqual([], profiler.sample)
Expand All @@ -1416,6 +1417,7 @@ def test_base(self, *mocks):
self.assertEqual(4, profiler.total_samples)
self.assertCountEqual(['this', 'is my', 'test'], profiler.sample)
self.assertEqual(1, profiler._empty_line_count)
self.assertAlmostEqual(1.43051e-05, profiler.capacity)
self.assertEqual("<class 'pandas.core.series.Series'>",
profiler.file_type)
self.assertIsNone(profiler.encoding)
Expand All @@ -1433,6 +1435,7 @@ def test_base(self, *mocks):
self.assertEqual(4, profiler.total_samples)
self.assertCountEqual(['this', 'is my', 'test'], profiler.sample)
self.assertEqual(1, profiler._empty_line_count)
self.assertAlmostEqual(1.43051e-05, profiler.capacity)
self.assertEqual("csv", profiler.file_type)
self.assertEqual("utf-8", profiler.encoding)
self.assertIsInstance(profiler._profile, UnstructuredCompiler)
Expand All @@ -1455,6 +1458,7 @@ def test_str_input_data(self, *mocks):
profiler = UnstructuredProfiler(data)
self.assertEqual(1, profiler.total_samples)
self.assertEqual(0, profiler._empty_line_count)
self.assertAlmostEqual(1.52587e-05, profiler.capacity)
self.assertEqual("<class 'str'>", profiler.file_type)
self.assertIsNone(profiler.encoding)
self.assertIsInstance(profiler._profile, UnstructuredCompiler)
Expand All @@ -1464,6 +1468,7 @@ def test_list_input_data(self, *mocks):
profiler = UnstructuredProfiler(data)
self.assertEqual(4, profiler.total_samples)
self.assertEqual(1, profiler._empty_line_count)
self.assertAlmostEqual(1.43051e-05, profiler.capacity)
self.assertEqual("<class 'list'>", profiler.file_type)
self.assertIsNone(profiler.encoding)
self.assertIsInstance(profiler._profile, UnstructuredCompiler)
Expand All @@ -1473,6 +1478,7 @@ def test_dataframe_input_data(self, *mocks):
profiler = UnstructuredProfiler(data)
self.assertEqual(4, profiler.total_samples)
self.assertEqual(1, profiler._empty_line_count)
self.assertAlmostEqual(1.43051e-05, profiler.capacity)
self.assertEqual("<class 'pandas.core.frame.DataFrame'>", profiler.file_type)
self.assertIsNone(profiler.encoding)
self.assertIsInstance(profiler._profile, UnstructuredCompiler)
Expand All @@ -1494,6 +1500,7 @@ def test_merge_profiles(self, *mocks):
merged_profile = profiler1 + profiler2
self.assertEqual(10, merged_profile.total_samples)
self.assertEqual(4, merged_profile._empty_line_count)
self.assertAlmostEqual(3.81469e-05, merged_profile.capacity)
# note how sample doesn't include whitespace lines
self.assertCountEqual(['this', ' is', 'here\n', 'more data', 'is my'],
merged_profile.sample)
Expand Down Expand Up @@ -1532,11 +1539,13 @@ def test_clean_data_and_get_base_stats(self, *mocks):

# note: bc the sample size is 3, only a subset of the data was sampled
self.assertTrue(np.issubdtype(np.object_, df_series.dtype))
# pop out the capacity to test first
self.assertAlmostEqual(2.38418e-05, base_stats.pop('capacity'))
self.assertDictEqual(
{
'sample': ['more data'], # bc of subset sampled
'sample_size': 3,
'empty_line_count': 2
'empty_line_count': 2,
},
base_stats)

Expand All @@ -1547,11 +1556,13 @@ def test_clean_data_and_get_base_stats(self, *mocks):

# note: bc the sample size is 3, only a subset of the data was sampled
self.assertTrue(np.issubdtype(np.object_, df_series.dtype))
# pop out the capacity to test first
self.assertAlmostEqual(2.38418e-05, base_stats.pop('capacity'))
self.assertDictEqual(
{
'sample': ['more data', 'here\n', 'a', ' is'],
'sample_size': 6,
'empty_line_count': 2
'empty_line_count': 2,
},
base_stats)

Expand All @@ -1564,13 +1575,15 @@ def test_update_profile(self, *mocks):
profiler = UnstructuredProfiler(data1)
self.assertEqual(4, profiler.total_samples)
self.assertEqual(1, profiler._empty_line_count)
self.assertAlmostEqual(1.43051e-05, profiler.capacity)
# note how sample doesn't include whitespace lines
self.assertCountEqual(['this', 'is my', 'test'], profiler.sample)

# update with second dataset
profiler.update_profile(data2)
self.assertEqual(10, profiler.total_samples)
self.assertEqual(4, profiler._empty_line_count)
self.assertAlmostEqual(3.81469e-05, profiler.capacity)
# note how sample doesn't include whitespace lines
self.assertCountEqual(['here\n', ' is', 'more data'], profiler.sample)

Expand Down Expand Up @@ -1637,6 +1650,9 @@ def test_total_samples(self):
# Checks that the profiler held on self.profiler recorded 7 empty lines.
# NOTE(review): self.profiler is created outside this view (presumably in
# the class-level setup) -- confirm the fixture data if this count changes.
def test_empty_line_count(self):
self.assertEqual(7, self.profiler._empty_line_count)

# Checks the capacity stored on self.profiler against the expected value.
# NOTE(review): assertAlmostEqual defaults to 7 decimal places, so for a
# value near 3.7e-04 this pins only about four significant digits; pass
# places= or delta= if a tighter check is wanted.
def test_get_capacity(self):
self.assertAlmostEqual(3.74794e-04, self.profiler.capacity)

def test_text_profiler_results(self):
# pop out times
self.assertIsNotNone(
Expand Down Expand Up @@ -1692,6 +1708,7 @@ def test_text_profiler_results(self):
'global_stats': {
'samples_used': 16,
'empty_line_count': 7,
'capacity': 3.74794e-04,
'file_type': "<class 'pandas.core.frame.DataFrame'>",
'encoding': None},
'data_stats': {
Expand All @@ -1702,6 +1719,9 @@ def test_text_profiler_results(self):
}
}
}
# pop out the capacity to test first
self.assertAlmostEqual(expected_report['global_stats'].pop('capacity'),
self.report['global_stats'].pop('capacity'))
self.assertDictEqual(expected_report, self.report)

def test_add_profilers(self):
Expand All @@ -1710,6 +1730,7 @@ def test_add_profilers(self):

self.assertEqual(21, merged_profiler.total_samples)
self.assertEqual(8, merged_profiler._empty_line_count)
self.assertAlmostEqual(4.0245e-04, merged_profiler.capacity)
self.assertCountEqual(
['test\n',
'extra',
Expand Down Expand Up @@ -1766,6 +1787,7 @@ def test_update_profile(self):
# tests
self.assertEqual(21, update_profiler.total_samples)
self.assertEqual(8, update_profiler._empty_line_count)
self.assertAlmostEqual(4.0245e-04, update_profiler.capacity)

# Note: different from merge because sample is from last update only
self.assertCountEqual(
Expand Down
18 changes: 18 additions & 0 deletions dataprofiler/tests/profilers/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,21 @@ def test_find_diff(self):
"g": [None, 15]
}
self.assertDictEqual(expected_diff, utils.find_diff_of_dicts(dict1, dict2))

def test_get_capacity(self):
    """
    Checks to see if the get capacity function is operating appropriately.
    """
    # wrong unit input; raw string so the escaped brackets reach the regex
    # engine without relying on an invalid string escape sequence
    with self.assertRaisesRegex(
            ValueError,
            r"Currently only supports the capacity unit in \[B, K, M, G\]"):
        utils.get_capacity([], unit="wrong_unit")

    # test with different data sizes
    self.assertEqual(0, utils.get_capacity([]))

    # the sentence is 33 ASCII characters, i.e. 33 bytes in UTF-8
    num_bytes = 33
    expected_in_m = num_bytes / 1024.0 ** 2
    expected_in_g = num_bytes / 1024.0 ** 3

    # places=12 because the default of 7 decimal places cannot tell values
    # of this magnitude apart (it would accept a wrong 'G' expectation)
    self.assertAlmostEqual(
        expected_in_m,
        utils.get_capacity(["This is test, a Test sentence.!!!"]),
        places=12)
    # splitting the text across items must not change the total
    self.assertAlmostEqual(
        expected_in_m,
        utils.get_capacity(["This is test,", " a Test sentence.!!!"]),
        places=12)
    self.assertAlmostEqual(
        expected_in_g,
        utils.get_capacity(["This is test, a Test sentence.!!!"], unit='G'),
        places=12)