capitalone · JGSweets · Jul 19, 2021 · Jul 16, 2021 · Jul 19, 2021 · Jul 19, 2021
@@ -884,6 +884,7 @@ def __init__(self, data, samples_per_update=None, min_true_samples=0,
 
         # Unstructured specific properties
         self._empty_line_count = 0
+        self.capacity = 0
         self.sample = []
 
         if data is not None:
@@ -910,6 +911,7 @@ def __add__(self, other):
         # unstruct specific property merging
         merged_profile._empty_line_count = (
                 self._empty_line_count + other._empty_line_count)
+        merged_profile.capacity = self.capacity + other.capacity
         samples = list(dict.fromkeys(self.sample + other.sample))
         merged_profile.sample = random.sample(list(samples),
                                               min(len(samples), 5))
@@ -931,6 +933,7 @@ def _update_base_stats(self, base_stats):
         self.total_samples += base_stats["sample_size"]
         self.sample = base_stats["sample"]
         self._empty_line_count += base_stats["empty_line_count"]
+        self.capacity += base_stats["capacity"]
 
     def report(self, report_options=None):
         """
@@ -965,7 +968,8 @@ def report(self, report_options=None):
                 "samples_used": self.total_samples,
                 "empty_line_count": self._empty_line_count,
                 "file_type": self.file_type,
-                "encoding": self.encoding
+                "encoding": self.encoding,
+                "capacity": self.capacity,
             }),
             ("data_stats", OrderedDict()),
         ])
@@ -997,12 +1001,16 @@ def _clean_data_and_get_base_stats(data, sample_size,
         len_data = len(data)
         if not len_data:
             return data, {
-                "sample_size": 0, "empty_line_count": dict(), "sample": [],
+                "sample_size": 0, "empty_line_count": dict(),
+                "sample": [], "capacity": 0
             }
 
         # ensure all data are of type str
         data = data.apply(str)
 
+        # get capacity
+        base_stats = {"capacity": utils.get_capacity(data)}
+
         # Setup sample generator
         sample_ind_generator = utils.shuffle_in_chunks(
             len_data, chunk_size=sample_size)
@@ -1038,12 +1046,14 @@ def _clean_data_and_get_base_stats(data, sample_size,
         data = data.loc[true_sample_list]
         total_empty = total_sample_size - len(true_sample_list)
 
-        base_stats = {
-            "sample_size": total_sample_size,
-            "empty_line_count": total_empty,
-            "sample": random.sample(list(data.values),
-                                    min(len(data), 5)),
-        }
+        base_stats.update(
+            {
+                "sample_size": total_sample_size,
+                "empty_line_count": total_empty,
+                "sample": random.sample(list(data.values),
+                                        min(len(data), 5)),
+            }
+        )
 
         return data, base_stats
 
@@ -1114,6 +1124,7 @@ def save(self, filepath=None):
             "_samples_per_update": self._samples_per_update,
             "_min_true_samples": self._min_true_samples,
             "_empty_line_count": self._empty_line_count,
+            "capacity": self.capacity,
             "options": self.options,
             "_profile": self.profile
         }

@@ -483,3 +483,23 @@ def find_diff_of_dicts(dict1, dict2):
         diff = "unchanged"
 
     return diff
+
+def get_capacity(data, unit='M'):
+    """
+    Get size (capacity) of the input data, measured in UTF-8 encoded bytes.
+
+    Every entry is encoded as UTF-8, the byte counts are summed, and the
+    total is divided by 1024 raised to the power selected by `unit`.
+
+    :param data: iterable of text entries; inputs with more than one
+        dimension (e.g. a DataFrame) are flattened so every cell is counted
+    :type data: Union[list, numpy.array, pandas.Series, pandas.DataFrame]
+    :param unit: capacity unit (B, K, M, or G)
+    :type unit: string
+    :return: capacity of the input data expressed in `unit`
+    :rtype: float
+    :raises ValueError: if `unit` is not one of B, K, M, or G
+    """
+    unit_map = {'B': 0, 'K': 1, 'M': 2, 'G': 3}
+    if unit not in unit_map:
+        raise ValueError('Currently only supports the '
+                         'capacity unit in [B, K, M, G]')
+
+    # Iterating a 2-D container directly does not visit its cells (a
+    # DataFrame yields column labels, a 2-D array yields rows), so flatten
+    # it first. 1-D inputs such as lists and Series are left untouched.
+    if getattr(data, 'ndim', 1) > 1:
+        data = getattr(data, 'values', data).ravel()
+
+    def _num_bytes(entry):
+        # bytes are already encoded; anything else is measured through its
+        # string form so non-str entries do not raise AttributeError
+        if isinstance(entry, (bytes, bytearray)):
+            return len(entry)
+        return len(str(entry).encode('utf-8'))
+
+    total_bytes = sum(_num_bytes(entry) for entry in data)
+    # dividing by a float keeps the return type a float even for unit='B'
+    return total_bytes / 1024.0 ** unit_map[unit]
@@ -1399,6 +1399,7 @@ def test_base(self, *mocks):
         self.assertEqual(0, profiler._min_true_samples)
         self.assertEqual(0, profiler.total_samples)
         self.assertEqual(0, profiler._empty_line_count)
+        self.assertEqual(0, profiler.capacity)
         self.assertEqual(0.2, profiler._sampling_ratio)
         self.assertEqual(5000, profiler._min_sample_size)
         self.assertEqual([], profiler.sample)
@@ -1416,6 +1417,7 @@ def test_base(self, *mocks):
         self.assertEqual(4, profiler.total_samples)
         self.assertCountEqual(['this', 'is my', 'test'], profiler.sample)
         self.assertEqual(1, profiler._empty_line_count)
+        self.assertAlmostEqual(1.43051e-05, profiler.capacity)
         self.assertEqual("<class 'pandas.core.series.Series'>",
                          profiler.file_type)
         self.assertIsNone(profiler.encoding)
@@ -1433,6 +1435,7 @@ def test_base(self, *mocks):
         self.assertEqual(4, profiler.total_samples)
         self.assertCountEqual(['this', 'is my', 'test'], profiler.sample)
         self.assertEqual(1, profiler._empty_line_count)
+        self.assertAlmostEqual(1.43051e-05, profiler.capacity)
         self.assertEqual("csv", profiler.file_type)
         self.assertEqual("utf-8", profiler.encoding)
         self.assertIsInstance(profiler._profile, UnstructuredCompiler)
@@ -1455,6 +1458,7 @@ def test_str_input_data(self, *mocks):
         profiler = UnstructuredProfiler(data)
         self.assertEqual(1, profiler.total_samples)
         self.assertEqual(0, profiler._empty_line_count)
+        self.assertAlmostEqual(1.52587e-05, profiler.capacity)
         self.assertEqual("<class 'str'>", profiler.file_type)
         self.assertIsNone(profiler.encoding)
         self.assertIsInstance(profiler._profile, UnstructuredCompiler)
@@ -1464,6 +1468,7 @@ def test_list_input_data(self, *mocks):
         profiler = UnstructuredProfiler(data)
         self.assertEqual(4, profiler.total_samples)
         self.assertEqual(1, profiler._empty_line_count)
+        self.assertAlmostEqual(1.43051e-05, profiler.capacity)
         self.assertEqual("<class 'list'>", profiler.file_type)
         self.assertIsNone(profiler.encoding)
         self.assertIsInstance(profiler._profile, UnstructuredCompiler)
@@ -1473,6 +1478,7 @@ def test_dataframe_input_data(self, *mocks):
         profiler = UnstructuredProfiler(data)
         self.assertEqual(4, profiler.total_samples)
         self.assertEqual(1, profiler._empty_line_count)
+        self.assertAlmostEqual(1.43051e-05, profiler.capacity)
         self.assertEqual("<class 'pandas.core.frame.DataFrame'>", profiler.file_type)
         self.assertIsNone(profiler.encoding)
         self.assertIsInstance(profiler._profile, UnstructuredCompiler)
@@ -1494,6 +1500,7 @@ def test_merge_profiles(self, *mocks):
         merged_profile = profiler1 + profiler2
         self.assertEqual(10, merged_profile.total_samples)
         self.assertEqual(4, merged_profile._empty_line_count)
+        self.assertAlmostEqual(3.81469e-05, merged_profile.capacity)
         # note how sample doesn't include whitespace lines
         self.assertCountEqual(['this', ' is', 'here\n', 'more data', 'is my'],
                               merged_profile.sample)
@@ -1532,11 +1539,13 @@ def test_clean_data_and_get_base_stats(self, *mocks):
 
         # note: bc the sample size is 3, only a subset of the data was sampled
         self.assertTrue(np.issubdtype(np.object_, df_series.dtype))
+        # pop out the capacity to test first
+        self.assertAlmostEqual(2.38418e-05, base_stats.pop('capacity'))
         self.assertDictEqual(
             {
                 'sample': ['more data'],  # bc of subset sampled
                 'sample_size': 3,
-                'empty_line_count': 2
+                'empty_line_count': 2,
             },
             base_stats)
 
@@ -1547,11 +1556,13 @@ def test_clean_data_and_get_base_stats(self, *mocks):
 
         # note: bc the sample size is 3, only a subset of the data was sampled
         self.assertTrue(np.issubdtype(np.object_, df_series.dtype))
+        # pop out the capacity to test first
+        self.assertAlmostEqual(2.38418e-05, base_stats.pop('capacity'))
         self.assertDictEqual(
             {
                 'sample': ['more data', 'here\n', 'a', ' is'],
                 'sample_size': 6,
-                'empty_line_count': 2
+                'empty_line_count': 2,
             },
             base_stats)
 
@@ -1564,13 +1575,15 @@ def test_update_profile(self, *mocks):
         profiler = UnstructuredProfiler(data1)
         self.assertEqual(4, profiler.total_samples)
         self.assertEqual(1, profiler._empty_line_count)
+        self.assertAlmostEqual(1.43051e-05, profiler.capacity)
         # note how sample doesn't include whitespace lines
         self.assertCountEqual(['this', 'is my', 'test'], profiler.sample)
 
         # update with second dataset
         profiler.update_profile(data2)
         self.assertEqual(10, profiler.total_samples)
         self.assertEqual(4, profiler._empty_line_count)
+        self.assertAlmostEqual(3.81469e-05, profiler.capacity)
         # note how sample doesn't include whitespace lines
         self.assertCountEqual(['here\n', ' is', 'more data'], profiler.sample)
 
@@ -1637,6 +1650,9 @@ def test_total_samples(self):
     def test_empty_line_count(self):
         """Check that the profiler under test recorded 7 empty lines."""
         # NOTE(review): self.profiler is presumably built in class-level
         # setup outside this hunk -- confirm against the full test file.
         self.assertEqual(7, self.profiler._empty_line_count)
 
+    def test_get_capacity(self):
+        """
+        Checks the capacity value stored on the profiler under test.
+        """
+        expected_capacity = 3.74794e-04
+        observed_capacity = self.profiler.capacity
+        self.assertAlmostEqual(expected_capacity, observed_capacity)
+
     def test_text_profiler_results(self):
         # pop out times
         self.assertIsNotNone(
@@ -1692,6 +1708,7 @@ def test_text_profiler_results(self):
             'global_stats': {
                 'samples_used': 16,
                 'empty_line_count': 7,
+                'capacity': 3.74794e-04,
                 'file_type': "<class 'pandas.core.frame.DataFrame'>",
                 'encoding': None},
             'data_stats': {
@@ -1702,6 +1719,9 @@ def test_text_profiler_results(self):
                 }
             }
         }
+        # pop out the capacity to test first
+        self.assertAlmostEqual(expected_report['global_stats'].pop('capacity'),
+                               self.report['global_stats'].pop('capacity'))
         self.assertDictEqual(expected_report, self.report)
 
     def test_add_profilers(self):
@@ -1710,6 +1730,7 @@ def test_add_profilers(self):
 
         self.assertEqual(21, merged_profiler.total_samples)
         self.assertEqual(8, merged_profiler._empty_line_count)
+        self.assertAlmostEqual(4.0245e-04, merged_profiler.capacity)
         self.assertCountEqual(
             ['test\n',
              'extra',
@@ -1766,6 +1787,7 @@ def test_update_profile(self):
         # tests
         self.assertEqual(21, update_profiler.total_samples)
         self.assertEqual(8, update_profiler._empty_line_count)
+        self.assertAlmostEqual(4.0245e-04, update_profiler.capacity)
 
         # Note: different from merge because sample is from last update only
         self.assertCountEqual(

@@ -124,3 +124,21 @@ def test_find_diff(self):
             "g": [None, 15]
         }
         self.assertDictEqual(expected_diff, utils.find_diff_of_dicts(dict1, dict2))
+
+    def test_get_capacity(self):
+        """
+        Checks to see if the get capacity function is operating appropriately.
+
+        Covers the unsupported-unit error, empty input, and the scaling of
+        a known 33-byte sentence into the 'M' (default) and 'G' units.
+        """
+        # wrong unit input; raw string so that \[ and \] reach the regex
+        # engine instead of being treated as invalid string escapes
+        with self.assertRaisesRegex(
+                ValueError,
+                r"Currently only supports the capacity unit in "
+                r"\[B, K, M, G\]"):
+            utils.get_capacity([], unit="wrong_unit")
+
+        # test with different data sizes
+        sentence = "This is test, a Test sentence.!!!"
+        num_bytes = len(sentence.encode("utf-8"))
+        self.assertEqual(33, num_bytes)
+
+        self.assertEqual(0, utils.get_capacity([]))
+        self.assertAlmostEqual(3.14712e-05, utils.get_capacity([sentence]))
+        self.assertAlmostEqual(
+            3.14712e-05,
+            utils.get_capacity(["This is test,", " a Test sentence.!!!"]))
+
+        # The default 7-decimal-place comparison accepts any value below
+        # 5e-08, so compare against the exact quotient with a tight delta.
+        # 33 bytes / 1024 ** 3 is about 3.0734e-08.
+        self.assertAlmostEqual(
+            num_bytes / 1024.0 ** 3,
+            utils.get_capacity([sentence], unit='G'),
+            delta=1e-15)