Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/alg 1639 add proxy information #180

Merged
merged 7 commits into from
Aug 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import math

import numpy as np
import pandas as pd

from data_gradients.common.registry.registry import register_feature_extractor
Expand Down Expand Up @@ -25,6 +28,10 @@ def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"):
self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode)
self.data = []

self.hist_transform_name = 'sqrt'
transforms = {'sqrt': lambda bbox_area: int(math.sqrt(bbox_area))}
self.hist_transform = transforms[self.hist_transform_name]

def update(self, sample: DetectionSample):
image_area = sample.image.shape[0] * sample.image.shape[1]
for class_id, bbox_xyxy in zip(sample.class_ids, sample.bboxes_xyxy):
Expand All @@ -36,12 +43,15 @@ def update(self, sample: DetectionSample):
"class_id": class_id,
"class_name": class_name,
"relative_bbox_area": 100 * (bbox_area / image_area),
f"bbox_area_{self.hist_transform_name}": self.hist_transform(bbox_area),
}
)

def aggregate(self) -> Feature:
df = pd.DataFrame(self.data)

dict_bincount = self._compute_histogram(df=df, transform_name=self.hist_transform_name)

df = self.value_extractor.select(df=df, id_col="class_id", split_col="split", value_col="relative_bbox_area")

# Height of the plot is proportional to the number of classes
Expand All @@ -66,7 +76,10 @@ def aggregate(self) -> Feature:
tight_layout=True,
)

json = {split: dict(df[df["split"] == split]["relative_bbox_area"].describe()) for split in df["split"].unique()}
json = {}
for split in df["split"].unique():
basic_stats = dict(df[df["split"] == split]["relative_bbox_area"].describe())
json[split] = {**basic_stats, "histogram_per_class": dict_bincount[split]}

feature = Feature(
data=df,
Expand All @@ -75,6 +88,54 @@ def aggregate(self) -> Feature:
)
return feature

@staticmethod
def _compute_histogram(df: pd.DataFrame, transform_name: str) -> dict:
    """
    Compute histograms for bounding box areas per class, separately for every split.

    The histogram uses integer bins of width 1 over the column `bbox_area_{transform_name}`.
    All histograms (across splits and classes) share the same length `max_value + 1`, where
    `max_value` is the largest transformed area found in the whole DataFrame.

    :param df: DataFrame containing bounding box data. Must provide the columns `split`, `class_name`
               and `bbox_area_{transform_name}` (non-negative values; non-integer values are truncated).
    :param transform_name: Type of transformation (like 'sqrt').
    :return: A dictionary containing relevant histogram information. Empty if `df` has no rows.
        Example:
            {
                'train': {
                    'transform': 'sqrt',  # Transformation applied to the bbox area
                    'bin_width': 1,  # width between histogram bins. This depends on how the histogram is created.
                    'max_value': 3,  # max (transformed) area value included in the histogram
                    'histograms': {  # Dictionary of class name and its corresponding histogram
                        'A': [0, 1, 0, 2],
                        'B': [0, 0, 1, 0]
                    }
                },
                'val': ...
            }
    """
    if df.empty:
        # Nothing to count. Without this guard, `max()` of an empty column is NaN and `int(NaN)` raises.
        return {}

    area_column = f'bbox_area_{transform_name}'
    max_value = int(df[area_column].max())

    dict_bincount = {}
    for split in df['split'].unique():
        split_data = df[df['split'] == split]

        histograms = {}
        for class_label in split_data['class_name'].unique():
            class_areas = split_data.loc[split_data['class_name'] == class_label, area_column]

            # `np.bincount` only accepts integer input; cast explicitly so that a float-typed column
            # (truncated the same way as `max_value` above) does not raise.
            bin_counts = np.bincount(class_areas.to_numpy().astype(np.int64), minlength=max_value + 1)
            histograms[class_label] = bin_counts.tolist()

        dict_bincount[split] = {
            'transform': transform_name,
            'bin_width': 1,  # Bins are the integers 0..max_value
            'max_value': max_value,
            'histograms': histograms,
        }

    return dict_bincount

@property
def title(self) -> str:
return "Distribution of Bounding Box Area"
Expand All @@ -87,3 +148,4 @@ def description(self) -> str:
"Another thing to keep in mind is that having too many very small objects may indicate that your are downsizing your original image to a "
"low resolution that is not appropriate for your objects."
)

2 changes: 2 additions & 0 deletions tests/deci_core_unit_test_suite_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import unittest

from tests.unit_tests.average_brightness_test import AverageBrightnessTest
from tests.unit_tests.feature_extractors.detection.test_bounding_boxes_area import TestComputeHistogram


class CoreUnitTestSuiteRunner:
Expand All @@ -19,6 +20,7 @@ def _add_modules_to_unit_tests_suite(self):
:return:
"""
self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(AverageBrightnessTest))
self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(TestComputeHistogram))


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import unittest

import numpy as np
import pandas as pd

from data_gradients.feature_extractors.object_detection.bounding_boxes_area import DetectionBoundingBoxArea
from data_gradients.utils.data_classes.data_samples import ImageChannelFormat, DetectionSample


class TestComputeHistogram(unittest.TestCase):
    """Tests for the per-class bounding box area histograms produced by `DetectionBoundingBoxArea`."""

    @staticmethod
    def _expected_entry(max_value, histograms):
        """Build the expected per-split histogram dictionary for the 'sqrt' transform with unit-width bins."""
        return {
            'transform': 'sqrt',
            'bin_width': 1,
            'max_value': max_value,
            'histograms': histograms,
        }

    def test_compute_histogram(self):
        """Several classes over two splits: every histogram is padded to the global max value."""
        frame = pd.DataFrame(
            {
                'bbox_area_sqrt': [1, 2, 3, 3, 3, 2, 3],
                'split': ['train', 'train', 'train', 'train', 'val', 'val', 'val'],
                'class_name': ['A', 'B', 'A', 'A', 'B', 'A', 'C'],
            }
        )

        actual = DetectionBoundingBoxArea._compute_histogram(frame, transform_name='sqrt')

        expected = {
            'train': self._expected_entry(3, {'A': [0, 1, 0, 2], 'B': [0, 0, 1, 0]}),
            'val': self._expected_entry(3, {'A': [0, 0, 1, 0], 'B': [0, 0, 0, 1], 'C': [0, 0, 0, 1]}),
        }
        self.assertEqual(actual, expected)

    def test_single_data_point(self):
        """A single row yields a single split with a single class histogram."""
        frame = pd.DataFrame({'bbox_area_sqrt': [1], 'split': ['train'], 'class_name': ['A']})

        actual = DetectionBoundingBoxArea._compute_histogram(frame, transform_name='sqrt')

        expected = {'train': self._expected_entry(1, {'A': [0, 1]})}
        self.assertEqual(actual, expected)

    def test_minimum_maximum_values(self):
        """Values far apart produce a long histogram with counts only at both ends."""
        frame = pd.DataFrame(
            {
                'bbox_area_sqrt': [1, 100],
                'split': ['val', 'val'],
                'class_name': ['A', 'A'],
            }
        )

        actual = DetectionBoundingBoxArea._compute_histogram(frame, transform_name='sqrt')

        # Bins 0..100: one hit at index 1, one hit at index 100.
        sparse_histogram = [0, 1] + [0] * 98 + [1]
        expected = {'val': self._expected_entry(100, {'A': sparse_histogram})}
        self.assertEqual(actual, expected)

    def test_histogram_json_output(self):
        """End-to-end: the histograms are exposed under `histogram_per_class` in the feature json."""
        train_boxes = np.array(
            [
                [2, 2, 4, 4],
                [3, 3, 6, 6],
                [1, 1, 5, 5],
                [1, 1, 4, 4],
                [5, 5, 6, 6],
                [7, 7, 9, 9],
            ]
        )
        val_boxes = np.array(
            [
                [1, 1, 3, 3],
                [2, 2, 5, 5],
                [5, 5, 6, 6],
            ]
        )

        train_sample = DetectionSample(
            sample_id='sample_1',
            split='train',
            image=np.zeros((100, 100, 3)),
            image_format=ImageChannelFormat.RGB,
            bboxes_xyxy=train_boxes,
            class_ids=np.array([0, 1, 2, 2, 3, 4]),
            class_names=['A', 'B', 'C', 'D', 'E'],
        )
        val_sample = DetectionSample(
            sample_id='sample_2',
            split='val',
            image=np.zeros((100, 100, 3)),
            image_format=ImageChannelFormat.RGB,
            bboxes_xyxy=val_boxes,
            class_ids=np.array([0, 1, 1]),
            class_names=['A', 'B'],
        )

        extractor = DetectionBoundingBoxArea()
        for sample in (train_sample, val_sample):
            extractor.update(sample)
        feature = extractor.aggregate()

        train_histograms = feature.json['train']['histogram_per_class']
        val_histograms = feature.json['val']['histogram_per_class']

        # The largest box (4x4, area 16) gives a max transformed value of 4 for both splits.
        expected_train = self._expected_entry(
            4,
            {
                'A': [0, 0, 1, 0, 0],
                'B': [0, 0, 0, 1, 0],
                'C': [0, 0, 0, 1, 1],
                'D': [0, 1, 0, 0, 0],
                'E': [0, 0, 1, 0, 0],
            },
        )
        expected_val = self._expected_entry(
            4,
            {
                'A': [0, 0, 1, 0, 0],
                'B': [0, 1, 0, 1, 0],
            },
        )

        self.assertEqual(train_histograms, expected_train)
        self.assertEqual(val_histograms, expected_val)


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()