From 0e7a85fe6bfb6f87e46fd3110fab75e7c11733d5 Mon Sep 17 00:00:00 2001 From: Zhiltsov Max Date: Mon, 3 Aug 2020 12:02:01 +0300 Subject: [PATCH 1/5] Add statistics command --- .../datumaro/cli/contexts/project/__init__.py | 24 ++-- datumaro/datumaro/components/operations.py | 136 ++++++++++++++++++ 2 files changed, 145 insertions(+), 15 deletions(-) diff --git a/datumaro/datumaro/cli/contexts/project/__init__.py b/datumaro/datumaro/cli/contexts/project/__init__.py index 65f81886e5c3..6b3acca0540b 100644 --- a/datumaro/datumaro/cli/contexts/project/__init__.py +++ b/datumaro/datumaro/cli/contexts/project/__init__.py @@ -17,7 +17,8 @@ from datumaro.components.dataset_filter import DatasetItemEncoder from datumaro.components.extractor import AnnotationType from datumaro.components.cli_plugin import CliPlugin -from datumaro.components.operations import mean_std +from datumaro.components.operations import \ + compute_image_statistics, compute_ann_statistics from .diff import DiffVisualizer from ...util import add_subparser, CliException, MultilineFormatter, \ make_file_name @@ -647,22 +648,15 @@ def build_stats_parser(parser_ctor=argparse.ArgumentParser): def stats_command(args): project = load_project(args.project_dir) + dataset = project.make_dataset() + stats = compute_image_statistics(dataset) + stats.update(compute_ann_statistics(dataset)) - def print_extractor_info(extractor, indent=''): - mean, std = mean_std(dataset) - print("%sImage mean:" % indent, ', '.join('%.3f' % n for n in mean)) - print("%sImage std:" % indent, ', '.join('%.3f' % n for n in std)) - - print("Dataset: ") - print_extractor_info(dataset) - - if 1 < len(dataset.subsets()): - print("Subsets: ") - for subset_name in dataset.subsets(): - subset = dataset.get_subset(subset_name) - print(" %s:" % subset_name) - print_extractor_info(subset, " " * 4) + dst_file = generate_next_file_name('statistics', ext='.json') + log.info("Writing project statistics to '%s'" % dst_file) + with open(dst_file, 'w') 
as f: + json.dump(stats, f) def build_info_parser(parser_ctor=argparse.ArgumentParser): parser = parser_ctor(help="Get project info", diff --git a/datumaro/datumaro/components/operations.py b/datumaro/datumaro/components/operations.py index a049e2675c5d..37ef425ac761 100644 --- a/datumaro/datumaro/components/operations.py +++ b/datumaro/datumaro/components/operations.py @@ -3,9 +3,13 @@ # # SPDX-License-Identifier: MIT +from copy import deepcopy + import cv2 import numpy as np +from datumaro.components.extractor import AnnotationType + def mean_std(dataset): """ @@ -14,6 +18,8 @@ def mean_std(dataset): # Use an online algorithm to: # - handle different image sizes # - avoid cancellation problem + if len(dataset) == 0: + return [0, 0, 0], [0, 0, 0] stats = np.empty((len(dataset), 2, 3), dtype=np.double) counts = np.empty(len(dataset), dtype=np.uint32) @@ -80,3 +86,133 @@ def compute_stats(stats, counts, mean_accessor, variance_accessor): *__class__.compute_stats(stats[:h], counts[:h], m, v), *__class__.compute_stats(stats[h:], counts[h:], m, v) ) + +def compute_image_statistics(dataset): + stats = { + 'dataset': {}, + 'subsets': {} + } + + def _extractor_stats(extractor): + mean, std = mean_std(extractor) + return { + 'images count': len(extractor), + 'image mean': [float(n) for n in mean[::-1]], + 'image std': [float(n) for n in std[::-1]], + } + + stats['dataset'].update(_extractor_stats(dataset)) + + subsets = dataset.subsets() or [None] + if subsets and 0 < len([s for s in subsets if s]): + for subset_name in subsets: + stats['subsets'][subset_name] = _extractor_stats( + dataset.get_subset(subset_name)) + + return stats + +def compute_ann_statistics(dataset): + labels = dataset.categories().get(AnnotationType.label) + def get_label(ann): + return labels.items[ann.label].name if ann.label is not None else None + + stats = { + 'images count': len(dataset), + 'annotations count': 0, + 'unannotated images count': 0, + 'unannotated images': [], + 'annotations by 
type': { t.name: { + 'count': 0, + } for t in AnnotationType }, + 'annotations': {}, + } + by_type = stats['annotations by type'] + + attr_template = { + 'count': 0, + 'values count': 0, + 'values present': set(), + 'distribution': {}, # value -> (count, total%) + } + label_stat = { + 'count': 0, + 'distribution': { l.name: [0, 0] for l in labels.items + }, # label -> (count, total%) + + 'attributes': {}, + } + stats['annotations']['labels'] = label_stat + segm_stat = { + 'avg. area': 0, + 'area distribution': [], # a histogram with 10 bins + # (min, min+10%), ..., (min+90%, max) -> (count, total%) + + 'pixel distribution': { l.name: [0, 0] for l in labels.items + }, # label -> (count, total%) + } + stats['annotations']['segments'] = segm_stat + segm_areas = [] + pixel_dist = segm_stat['pixel distribution'] + total_pixels = 0 + + for item in dataset: + if len(item.annotations) == 0: + stats['unannotated images'].append(item.id) + continue + + for ann in item.annotations: + by_type[ann.type.name]['count'] += 1 + + if not hasattr(ann, 'label') or ann.label is None: + continue + + if ann.type in {AnnotationType.mask, + AnnotationType.polygon, AnnotationType.bbox}: + area = ann.get_area() + segm_areas.append(area) + pixel_dist[get_label(ann)][0] += int(area) + + label_stat['count'] += 1 + label_stat['distribution'][get_label(ann)][0] += 1 + + for name, value in ann.attributes.items(): + if name.lower() in { 'occluded', 'visibility', 'score', + 'id', 'track_id' }: + continue + attrs_stat = label_stat['attributes'].setdefault(name, + deepcopy(attr_template)) + attrs_stat['count'] += 1 + attrs_stat['values present'].add(str(value)) + attrs_stat['distribution'] \ + .setdefault(str(value), [0, 0])[0] += 1 + + stats['annotations count'] = sum(t['count'] for t in + stats['annotations by type'].values()) + stats['unannotated images count'] = len(stats['unannotated images']) + + for label_info in label_stat['distribution'].values(): + label_info[1] = label_info[0] / 
label_stat['count'] + + for label_attr in label_stat['attributes'].values(): + label_attr['values count'] = len(label_attr['values present']) + label_attr['values present'] = sorted(label_attr['values present']) + for attr_info in label_attr['distribution'].values(): + attr_info[1] = attr_info[0] / label_attr['count'] + + # numpy.sum might be faster, but could overflow with large datasets. + # Python's int can transparently mutate to be of indefinite precision (long) + total_pixels = sum(int(a) for a in segm_areas) + + segm_stat['avg. area'] = total_pixels / (len(segm_areas) or 1.0) + + for label_info in segm_stat['pixel distribution'].values(): + label_info[1] = label_info[0] / total_pixels + + if len(segm_areas) != 0: + hist, bins = np.histogram(segm_areas) + segm_stat['area distribution'] = [{ + 'min': float(bin_min), 'max': float(bin_max), + 'count': int(c), 'percent': int(c) / len(segm_areas) + } for c, (bin_min, bin_max) in zip(hist, zip(bins[:-1], bins[1:]))] + + return stats From a9b8232b06f70e3ba62c45550d93872f931fb239 Mon Sep 17 00:00:00 2001 From: Zhiltsov Max Date: Mon, 3 Aug 2020 12:02:10 +0300 Subject: [PATCH 2/5] Add tests --- datumaro/tests/test_ops.py | 108 +++++++++++++++++++++++++++++++++++-- 1 file changed, 105 insertions(+), 3 deletions(-) diff --git a/datumaro/tests/test_ops.py b/datumaro/tests/test_ops.py index 24f32f352a71..4b7e480b8009 100644 --- a/datumaro/tests/test_ops.py +++ b/datumaro/tests/test_ops.py @@ -1,7 +1,8 @@ import numpy as np -from datumaro.components.extractor import Extractor, DatasetItem -from datumaro.components.operations import mean_std +from datumaro.components.extractor import (Extractor, DatasetItem, Label, + Mask, Bbox, Points, LabelCategories, AnnotationType, Caption) +from datumaro.components.operations import mean_std, compute_ann_statistics from unittest import TestCase @@ -28,4 +29,105 @@ def __iter__(self): for em, am in zip(expected_mean, actual_mean): self.assertAlmostEqual(em, am, places=0) for estd, astd 
in zip(expected_std, actual_std): - self.assertAlmostEqual(estd, astd, places=0) \ No newline at end of file + self.assertAlmostEqual(estd, astd, places=0) + + def test_stats(self): + dataset = Dataset.from_iterable([ + DatasetItem(id=1, image=np.ones((5, 5, 3)), annotations=[ + Caption('hello'), + Caption('world'), + Label(2, attributes={ 'x': 1, 'y': '2', }), + Bbox(1, 2, 2, 2, label=2, attributes={ 'score': 0.5, }), + Bbox(5, 6, 2, 2, attributes={ + 'x': 1, 'y': '3', 'occluded': True, + }), + Points([1, 2, 2, 0, 1, 1], label=0), + Mask(label=3, image=np.array([ + [0, 0, 1, 1, 1], + [0, 0, 1, 1, 1], + [0, 0, 1, 1, 1], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + ])), + ]), + DatasetItem(id=2, image=np.ones((2, 4, 3)), annotations=[ + Label(2, attributes={ 'x': 2, 'y': '2', }), + Bbox(1, 2, 2, 2, label=3, attributes={ 'score': 0.5, }), + Bbox(5, 6, 2, 2, attributes={ + 'x': 2, 'y': '3', 'occluded': False, + }), + ]), + DatasetItem(id=3), + ], categories=['label_%s' % i for i in range(4)]) + + expected = { + 'images count': 3, + 'annotations count': 10, + 'unannotated images count': 1, + 'unannotated images': ['3'], + 'annotations by type': { + 'label': { 'count': 2, }, + 'polygon': { 'count': 0, }, + 'polyline': { 'count': 0, }, + 'bbox': { 'count': 4, }, + 'mask': { 'count': 1, }, + 'points': { 'count': 1, }, + 'caption': { 'count': 2, }, + }, + 'annotations': { + 'labels': { + 'count': 6, + 'distribution': { + 'label_0': [1, 1/6], + 'label_1': [0, 0.0], + 'label_2': [3, 3/6], + 'label_3': [2, 2/6], + }, + 'attributes': { + 'x': { + 'count': 2, # annotations with no label are skipped + 'values count': 2, + 'values present': ['1', '2'], + 'distribution': { + '1': [1, 1/2], + '2': [1, 1/2], + }, + }, + 'y': { + 'count': 2, # annotations with no label are skipped + 'values count': 1, + 'values present': ['2'], + 'distribution': { + '2': [2, 2/2], + }, + }, + # must not include "special" attributes like "occluded" + } + }, + 'segments': { + 'avg. 
area': (4 * 2 + 9 * 1) / 3, + 'area distribution': [ + {'min': 4.0, 'max': 4.5, 'count': 2, 'percent': 2/3}, + {'min': 4.5, 'max': 5.0, 'count': 0, 'percent': 0.0}, + {'min': 5.0, 'max': 5.5, 'count': 0, 'percent': 0.0}, + {'min': 5.5, 'max': 6.0, 'count': 0, 'percent': 0.0}, + {'min': 6.0, 'max': 6.5, 'count': 0, 'percent': 0.0}, + {'min': 6.5, 'max': 7.0, 'count': 0, 'percent': 0.0}, + {'min': 7.0, 'max': 7.5, 'count': 0, 'percent': 0.0}, + {'min': 7.5, 'max': 8.0, 'count': 0, 'percent': 0.0}, + {'min': 8.0, 'max': 8.5, 'count': 0, 'percent': 0.0}, + {'min': 8.5, 'max': 9.0, 'count': 1, 'percent': 1/3}, + ], + 'pixel distribution': { + 'label_0': [0, 0.0], + 'label_1': [0, 0.0], + 'label_2': [4, 4/17], + 'label_3': [13, 13/17], + }, + } + }, + } + + actual = compute_ann_statistics(TestExtractor()) + + self.assertEqual(expected, actual) \ No newline at end of file From bef09273eac8b27486bc8b958a35ef69f12d1c03 Mon Sep 17 00:00:00 2001 From: Zhiltsov Max Date: Mon, 3 Aug 2020 12:02:22 +0300 Subject: [PATCH 3/5] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e230af8bed80..3973bcb5702f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Support creating multiple jobs for each task through python cli (https://github.com/opencv/cvat/pull/1950) - python cli over https () - Error message when plugins weren't able to initialize instead of infinite loading () +- [Datumaro] Dataset statistics () ### Changed - Smaller object details () From 0974d8806919e6450c55686b5c57124bab46ae85 Mon Sep 17 00:00:00 2001 From: Zhiltsov Max Date: Mon, 3 Aug 2020 12:03:44 +0300 Subject: [PATCH 4/5] fix test --- datumaro/tests/test_ops.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datumaro/tests/test_ops.py b/datumaro/tests/test_ops.py index 4b7e480b8009..ed165b2ddd19 100644 --- 
a/datumaro/tests/test_ops.py +++ b/datumaro/tests/test_ops.py @@ -1,7 +1,8 @@ import numpy as np from datumaro.components.extractor import (Extractor, DatasetItem, Label, - Mask, Bbox, Points, LabelCategories, AnnotationType, Caption) + Mask, Bbox, Points, Caption) +from datumaro.components.project import Dataset from datumaro.components.operations import mean_std, compute_ann_statistics from unittest import TestCase @@ -128,6 +129,6 @@ def test_stats(self): }, } - actual = compute_ann_statistics(TestExtractor()) + actual = compute_ann_statistics(dataset) self.assertEqual(expected, actual) \ No newline at end of file From cc0fb8df6d35d3d757f54857aa8cb17803b405a5 Mon Sep 17 00:00:00 2001 From: Zhiltsov Max Date: Mon, 3 Aug 2020 12:10:42 +0300 Subject: [PATCH 5/5] handle image absence --- .../datumaro/cli/contexts/project/__init__.py | 5 ++-- datumaro/datumaro/components/operations.py | 27 ++++++++++++++++--- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/datumaro/datumaro/cli/contexts/project/__init__.py b/datumaro/datumaro/cli/contexts/project/__init__.py index 6b3acca0540b..99f5fe82467f 100644 --- a/datumaro/datumaro/cli/contexts/project/__init__.py +++ b/datumaro/datumaro/cli/contexts/project/__init__.py @@ -650,13 +650,14 @@ def stats_command(args): project = load_project(args.project_dir) dataset = project.make_dataset() - stats = compute_image_statistics(dataset) + stats = {} + stats.update(compute_image_statistics(dataset)) stats.update(compute_ann_statistics(dataset)) dst_file = generate_next_file_name('statistics', ext='.json') log.info("Writing project statistics to '%s'" % dst_file) with open(dst_file, 'w') as f: - json.dump(stats, f) + json.dump(stats, f, indent=4, sort_keys=True) def build_info_parser(parser_ctor=argparse.ArgumentParser): parser = parser_ctor(help="Get project info", diff --git a/datumaro/datumaro/components/operations.py b/datumaro/datumaro/components/operations.py index 37ef425ac761..7961775e8b90 100644 --- 
a/datumaro/datumaro/components/operations.py +++ b/datumaro/datumaro/components/operations.py @@ -3,6 +3,7 @@ # # SPDX-License-Identifier: MIT +import logging as log from copy import deepcopy import cv2 @@ -94,13 +95,31 @@ def compute_image_statistics(dataset): } def _extractor_stats(extractor): - mean, std = mean_std(extractor) - return { + available = True + for item in extractor: + if not (item.has_image and item.image.has_data): + available = False + log.warn("Item %s has no image. Image stats won't be computed", + item.id) + break + + stats = { 'images count': len(extractor), - 'image mean': [float(n) for n in mean[::-1]], - 'image std': [float(n) for n in std[::-1]], } + if available: + mean, std = mean_std(extractor) + stats.update({ + 'image mean': [float(n) for n in mean[::-1]], + 'image std': [float(n) for n in std[::-1]], + }) + else: + stats.update({ + 'image mean': 'n/a', + 'image std': 'n/a', + }) + return stats + stats['dataset'].update(_extractor_stats(dataset)) subsets = dataset.subsets() or [None]