Deci-AI · shanibenbaruch · Aug 31, 2023 · Aug 30, 2023 · Aug 30, 2023 · Aug 30, 2023
diff --git a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py
@@ -1,3 +1,6 @@
+import math
+
+import numpy as np
 import pandas as pd
 
 from data_gradients.common.registry.registry import register_feature_extractor
@@ -25,6 +28,10 @@ def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"):
         self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode)
         self.data = []
 
+        self.hist_transform_name = 'sqrt'
+        transforms = {'sqrt': lambda bbox_area: int(math.sqrt(bbox_area))}
+        self.hist_transform = transforms[self.hist_transform_name]
+
     def update(self, sample: DetectionSample):
         image_area = sample.image.shape[0] * sample.image.shape[1]
         for class_id, bbox_xyxy in zip(sample.class_ids, sample.bboxes_xyxy):
@@ -36,12 +43,15 @@ def update(self, sample: DetectionSample):
                     "class_id": class_id,
                     "class_name": class_name,
                     "relative_bbox_area": 100 * (bbox_area / image_area),
+                    f"bbox_area_{self.hist_transform_name}": self.hist_transform(bbox_area),
                 }
             )
 
     def aggregate(self) -> Feature:
         df = pd.DataFrame(self.data)
 
+        dict_bincount = self._compute_histogram(df=df, transform_name=self.hist_transform_name)
+
         df = self.value_extractor.select(df=df, id_col="class_id", split_col="split", value_col="relative_bbox_area")
 
         # Height of the plot is proportional to the number of classes
@@ -66,7 +76,10 @@ def aggregate(self) -> Feature:
             tight_layout=True,
         )
 
-        json = {split: dict(df[df["split"] == split]["relative_bbox_area"].describe()) for split in df["split"].unique()}
+        json = {}
+        for split in df["split"].unique():
+            basic_stats = dict(df[df["split"] == split]["relative_bbox_area"].describe())
+            json[split] = {**basic_stats, "histogram_per_class": dict_bincount[split]}
 
         feature = Feature(
             data=df,
@@ -75,6 +88,54 @@ def aggregate(self) -> Feature:
         )
         return feature
 
+    @staticmethod
+    def _compute_histogram(df: pd.DataFrame, transform_name: str) -> dict:
+        """
+        Compute, for every split, one histogram of transformed bounding box areas per class.
+
+        Bins have a width of 1 and span 0 up to the largest transformed value found in the whole `df`
+        (not per split), so every histogram returned has the same length: `max_value + 1`.
+
+        :param df:              DataFrame with 'split', 'class_name' and 'bbox_area_<transform_name>' columns.
+                                The transformed areas must be non-negative integers (required by `np.bincount`).
+        :param transform_name:  Name of the transformation that was applied to the bbox area (like 'sqrt').
+        :return:                Dictionary mapping each split to its histogram information.
+            Example:
+            {
+                'train': {
+                    'transform': 'sqrt', # Transformation applied to the bbox area
+                    'bin_width': 1,      # Width of each histogram bin
+                    'max_value': 3,      # Largest (transformed) area value covered by the histograms
+                    'histograms': {'A': [0, 1, 0, 2], 'B': [0, 0, 1, 0]},  # Class name -> count per bin
+                },
+                'val': ...
+            }
+        """
+        # Column holding the already-transformed areas; resolved once instead of on every iteration.
+        value_col = f'bbox_area_{transform_name}'
+        max_value = int(df[value_col].max())
+
+        dict_bincount = {}
+        for split in df['split'].unique():
+            split_data = df[df['split'] == split]
+
+            # Index i of a histogram holds the number of boxes whose transformed area equals i.
+            # `minlength` pads every histogram up to the global maximum so that they are all comparable.
+            histograms = {}
+            for class_label in split_data['class_name'].unique():
+                class_values = split_data.loc[split_data['class_name'] == class_label, value_col]
+                bin_counts = np.bincount(class_values, minlength=max_value + 1)
+                histograms[class_label] = bin_counts.tolist()
+
+            dict_bincount[split] = {
+                'transform': transform_name,
+                'bin_width': 1,
+                'max_value': max_value,
+                'histograms': histograms,
+            }
+
+        return dict_bincount
+
     @property
     def title(self) -> str:
         return "Distribution of Bounding Box Area"
@@ -87,3 +148,4 @@ def description(self) -> str:
             "Another thing to keep in mind is that having too many very small objects may indicate that your are downsizing your original image to a "
             "low resolution that is not appropriate for your objects."
         )
+
diff --git a/tests/deci_core_unit_test_suite_runner.py b/tests/deci_core_unit_test_suite_runner.py
@@ -2,6 +2,7 @@
 import unittest
 
 from tests.unit_tests.average_brightness_test import AverageBrightnessTest
+from tests.unit_tests.feature_extractors.detection.test_bounding_boxes_area import TestComputeHistogram
 
 
 class CoreUnitTestSuiteRunner:
@@ -19,6 +20,7 @@ def _add_modules_to_unit_tests_suite(self):
             :return:
         """
         self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(AverageBrightnessTest))
+        self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(TestComputeHistogram))
 
 
 if __name__ == "__main__":

diff --git a/tests/unit_tests/feature_extractors/detection/test_bounding_boxes_area.py b/tests/unit_tests/feature_extractors/detection/test_bounding_boxes_area.py
@@ -0,0 +1,154 @@
+import unittest
+
+import numpy as np
+import pandas as pd
+
+from data_gradients.feature_extractors.object_detection.bounding_boxes_area import DetectionBoundingBoxArea
+from data_gradients.utils.data_classes.data_samples import ImageChannelFormat, DetectionSample
+
+
+class TestComputeHistogram(unittest.TestCase):
+    """Checks the per-split, per-class histograms built by `DetectionBoundingBoxArea`."""
+
+    def test_compute_histogram(self):
+        """Two splits with several classes: every histogram is padded up to the global max value."""
+        frame = pd.DataFrame(
+            {
+                "bbox_area_sqrt": [1, 2, 3, 3, 3, 2, 3],
+                "split": ["train"] * 4 + ["val"] * 3,
+                "class_name": ["A", "B", "A", "A", "B", "A", "C"],
+            }
+        )
+
+        expected = {
+            "train": {
+                "transform": "sqrt",
+                "bin_width": 1,
+                "max_value": 3,
+                "histograms": {"A": [0, 1, 0, 2], "B": [0, 0, 1, 0]},
+            },
+            "val": {
+                "transform": "sqrt",
+                "bin_width": 1,
+                "max_value": 3,
+                "histograms": {"A": [0, 0, 1, 0], "B": [0, 0, 0, 1], "C": [0, 0, 0, 1]},
+            },
+        }
+
+        actual = DetectionBoundingBoxArea._compute_histogram(frame, transform_name="sqrt")
+        self.assertEqual(actual, expected)
+
+    def test_single_data_point(self):
+        """A single bounding box yields a single class with a two-bin histogram."""
+        frame = pd.DataFrame({"bbox_area_sqrt": [1], "split": ["train"], "class_name": ["A"]})
+
+        expected = {
+            "train": {
+                "transform": "sqrt",
+                "bin_width": 1,
+                "max_value": 1,
+                "histograms": {"A": [0, 1]},
+            }
+        }
+
+        actual = DetectionBoundingBoxArea._compute_histogram(frame, transform_name="sqrt")
+        self.assertEqual(actual, expected)
+
+    def test_minimum_maximum_values(self):
+        """Values that are far apart leave only the two matching bins populated."""
+        frame = pd.DataFrame(
+            {"bbox_area_sqrt": [1, 100], "split": ["val", "val"], "class_name": ["A", "A"]}
+        )
+
+        # 101 bins (0..100): one box falls in bin 1 and one in bin 100.
+        bin_counts = [0] * 101
+        bin_counts[1] = 1
+        bin_counts[100] = 1
+        expected = {
+            "val": {
+                "transform": "sqrt",
+                "bin_width": 1,
+                "max_value": 100,
+                "histograms": {"A": bin_counts},
+            }
+        }
+
+        actual = DetectionBoundingBoxArea._compute_histogram(frame, transform_name="sqrt")
+        self.assertEqual(actual, expected)
+
+    def test_histogram_json_output(self):
+        """End to end: the histograms are exposed under `histogram_per_class` in the feature json."""
+        image_shape = (100, 100, 3)
+
+        train_boxes = [
+            [2, 2, 4, 4],  # class A, 2 x 2 box -> bin 2
+            [3, 3, 6, 6],  # class B, 3 x 3 box -> bin 3
+            [1, 1, 5, 5],  # class C, 4 x 4 box -> bin 4
+            [1, 1, 4, 4],  # class C, 3 x 3 box -> bin 3
+            [5, 5, 6, 6],  # class D, 1 x 1 box -> bin 1
+            [7, 7, 9, 9],  # class E, 2 x 2 box -> bin 2
+        ]
+        train_sample = DetectionSample(
+            sample_id="sample_1",
+            split="train",
+            image=np.zeros(image_shape),
+            image_format=ImageChannelFormat.RGB,
+            bboxes_xyxy=np.array(train_boxes),
+            class_ids=np.array([0, 1, 2, 2, 3, 4]),
+            class_names=["A", "B", "C", "D", "E"],
+        )
+
+        val_boxes = [
+            [1, 1, 3, 3],  # class A, 2 x 2 box -> bin 2
+            [2, 2, 5, 5],  # class B, 3 x 3 box -> bin 3
+            [5, 5, 6, 6],  # class B, 1 x 1 box -> bin 1
+        ]
+        val_sample = DetectionSample(
+            sample_id="sample_2",
+            split="val",
+            image=np.zeros(image_shape),
+            image_format=ImageChannelFormat.RGB,
+            bboxes_xyxy=np.array(val_boxes),
+            class_ids=np.array([0, 1, 1]),
+            class_names=["A", "B"],
+        )
+
+        # Accumulate both samples, then aggregate them into a single feature.
+        extractor = DetectionBoundingBoxArea()
+        for sample in (train_sample, val_sample):
+            extractor.update(sample)
+        feature_json = extractor.aggregate().json
+
+        histograms_train = feature_json["train"]["histogram_per_class"]
+        histograms_val = feature_json["val"]["histogram_per_class"]
+
+        # `max_value` is 4 for both splits: it is taken over all samples, not per split.
+        expected_train = {
+            "transform": "sqrt",
+            "bin_width": 1,
+            "max_value": 4,
+            "histograms": {
+                "A": [0, 0, 1, 0, 0],
+                "B": [0, 0, 0, 1, 0],
+                "C": [0, 0, 0, 1, 1],
+                "D": [0, 1, 0, 0, 0],
+                "E": [0, 0, 1, 0, 0],
+            },
+        }
+
+        expected_val = {
+            "transform": "sqrt",
+            "bin_width": 1,
+            "max_value": 4,
+            "histograms": {
+                "A": [0, 0, 1, 0, 0],
+                "B": [0, 1, 0, 1, 0],
+            },
+        }
+
+        self.assertEqual(histograms_train, expected_train)
+        self.assertEqual(histograms_val, expected_val)
+
+
+if __name__ == '__main__':  # Run this module's tests when the file is executed directly
+    unittest.main()