Skip to content

Commit

Permalink
Initial commit. version 0.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
shono committed Jun 21, 2021
0 parents commit 9e6c853
Show file tree
Hide file tree
Showing 17 changed files with 612 additions and 0 deletions.
96 changes: 96 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# simpledataset

Utility tools for simple vision image dataset format. WORK IN PROGRESS.

## Features
* See the summary of a dataset
* Convert from/to various dataset formats.
* Web UI to look into a dataset
* CLI tools to split and concat datasets.
* CLI tools to modify labels.

## Install
```
pip install simpledataset==0.1.0
```

## Usage
```
# Show summary
dataset_summary <input_dataset>
# For Classification dataset, extract only the images that have the specified labels.
# For Detection dataset, extract only the boxes that have the specified labels.
dataset_filter <input_dataset> <output_dataset> [--include_class <class_id> [<class_id> ...]] [--exclude_class <class_id> [<class_id> ...]]
# Update class labels
dataset_map <input_dataset> <output_dataset> --map <src_class_id> <dst_class_id> [--map <src_class_id> <dst_class_id> [--map...]]
dataset_split # NYI
dataset_concat # NYI
dataset_shuffle # NYI
dataset_sample # NYI
dataset_pack # NYI
# Convert from/to other dataset types. COCO format is supported now.
dataset_convert_from <input_dataset> <input_format> <output_filepath>
dataset_convert_to <input_dataset> <output_format> <output_filepath>
```


## Examples
### Change class ids
For example, if you would like to turn MNIST into an odd/even classification dataset, you can use the dataset_map command. In this example, we use class_id=0 for even numbers and class_id=1 for odd numbers.
```bash
dataset_map mnist.txt new_dataset.txt --map 2 0 --map 3 1 --map 4 0 --map 5 1 --map 6 0 --map 7 1 --map 8 0 --map 9 1
```


## SIMPLE Dataset format
Currently there are two dataset formats: Image Classification and Object Detection. Both datasets have a single txt file, image files, and an optional list of label names (labels.txt). In addition to that, Object Detection datasets have label files that contain bbox info.

### Image Classification
The main txt format is:
```
<file> ::= <txt_line> ('\n' <txt_line>)*
<txt_line> ::= <image_filepath> ' ' <labels>
<image_filepath> ::= <filepath> | <zip_filepath> '@' <entry_name>
<labels> ::= <class_id> (',' <class_id>)*
```

Here is an example txt file.
```
train_images.zip@0.jpg 0
train_images2.zip@1.jpg 1
image.png 0,1
image2.bmp 0,1,2,3
```

### Object Detection
The main txt format is:
```
<file> ::= <txt_line> ('\n' <txt_line>)*
<txt_line> ::= <image_filepath> ' ' <label_filepath>
<image_filepath> ::= <filepath> | <zip_filepath> '@' <entry_name>
<label_filepath> ::= <filepath> | <zip_filepath> '@' <entry_name>
```

The format of a label file is:
```
<file> ::= <label_line> ('\n' <label_line>)*
<label_line> ::= <class_id> ' ' <bbox_x_min> ' ' <bbox_y_min> ' ' <bbox_x_max> ' ' <bbox_y_max>
<class_id> ::= <int>
<bbox_x_min> ::= <int> ; 0 <= <bbox_x_min> < <bbox_x_max> <= <image_width>
<bbox_y_min> ::= <int> ; 0 <= <bbox_y_min> < <bbox_y_max> <= <image_height>
<bbox_x_max> ::= <int>
<bbox_y_max> ::= <int>
```


## Usage for remote datasets
NYI.
This tool allows you to use datasets on Azure Blob Storage. You can update a dataset on the storage efficiently.
21 changes: 21 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import setuptools


# Package metadata and entry points for the simpledataset distribution.
setuptools.setup(name='simpledataset',
                 version='0.1.0',
                 description="Utility tools for SIMPLE vision dataset format.",
                 packages=setuptools.find_packages(),
                 license='MIT',
                 url='https://github.com/shonohs/simpledataset',
                 classifiers=[
                     'Intended Audience :: Developers',
                     'License :: OSI Approved :: MIT License'
                 ],
                 # Each console script maps a CLI command name to a module's main()
                 # in simpledataset.commands (see the README "Usage" section).
                 entry_points={
                     'console_scripts': [
                         'dataset_convert_from=simpledataset.commands.convert_from:main',
                         'dataset_convert_to=simpledataset.commands.convert_to:main',
                         'dataset_filter=simpledataset.commands.filter:main',
                         'dataset_map=simpledataset.commands.map:main',
                         'dataset_summary=simpledataset.commands.summary:main'
                     ]})
Empty file added simpledataset/__init__.py
Empty file.
Empty file.
33 changes: 33 additions & 0 deletions simpledataset/commands/convert_from.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import argparse
import pathlib
from simpledataset.common import DatasetWriter
from simpledataset.converters import CocoReader


def convert_from(input_filepath, source_format, output_filepath):
    """Read a dataset in the given source format and save it in SIMPLE format.

    Args:
        input_filepath (pathlib.Path): Path to the source dataset file.
        source_format (str): Source format name; only 'coco' is supported.
        output_filepath (pathlib.Path): Where to write the SIMPLE main txt file.

    Raises:
        RuntimeError: If source_format is not a supported format.
    """
    if source_format != 'coco':
        raise RuntimeError(f"Unsupported format: {source_format}")

    # The reader resolves relative file references against the input's directory.
    dataset = CocoReader().read(input_filepath, input_filepath.parent)

    output_directory = output_filepath.parent
    writer = DatasetWriter(output_directory)
    writer.write(dataset, output_filepath)
    print(f"Successfully saved {output_filepath}.")


def main():
    """CLI entry point: parse arguments and run the format conversion."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('input_filepath', type=pathlib.Path)
    arg_parser.add_argument('source_format', choices=['coco'])
    arg_parser.add_argument('output_filepath', type=pathlib.Path)
    parsed = arg_parser.parse_args()

    # Refuse to clobber an existing output file.
    if parsed.output_filepath.exists():
        arg_parser.error(f"{parsed.output_filepath} already exists.")

    convert_from(parsed.input_filepath, parsed.source_format, parsed.output_filepath)


if __name__ == '__main__':
    main()
35 changes: 35 additions & 0 deletions simpledataset/commands/convert_to.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import argparse
import pathlib
from simpledataset.common import SimpleDatasetFactory
from simpledataset.converters import CocoWriter


def convert_to(main_txt, directory, target_format, output_filepath):
    """Load a SIMPLE dataset and write it out in the given target format.

    Args:
        main_txt (str): Contents of the dataset's main txt file.
        directory (pathlib.Path): Directory that relative paths in main_txt resolve against.
        target_format (str): Target format name; only 'coco' is supported.
        output_filepath (pathlib.Path): Where to write the converted dataset.

    Raises:
        RuntimeError: If target_format is not a supported format.
    """
    # Load first so dataset parse errors surface before the format check,
    # matching the original error precedence.
    dataset = SimpleDatasetFactory().load(main_txt, directory)

    if target_format != 'coco':
        raise RuntimeError(f"Unsupported format: {target_format}")

    writer = CocoWriter()
    writer.write(dataset, output_filepath, output_filepath.parent)
    print(f"Successfully saved to {output_filepath}")


def main():
    """CLI entry point: parse arguments and run the dataset export."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('main_txt_filepath', type=pathlib.Path)
    arg_parser.add_argument('target_format', choices=['coco'])
    arg_parser.add_argument('output_filepath', type=pathlib.Path)
    parsed = arg_parser.parse_args()

    main_txt = parsed.main_txt_filepath.read_text()
    dataset_directory = parsed.main_txt_filepath.parent

    # Refuse to clobber an existing output file.
    if parsed.output_filepath.exists():
        arg_parser.error(f"{parsed.output_filepath} already exists.")

    convert_to(main_txt, dataset_directory, parsed.target_format, parsed.output_filepath)


if __name__ == '__main__':
    main()
61 changes: 61 additions & 0 deletions simpledataset/commands/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import argparse
import logging
import pathlib
from simpledataset.common import SimpleDatasetFactory, ImageClassificationDataset, ObjectDetectionDataset, DatasetWriter

logger = logging.getLogger(__name__)


def filter_dataset(main_txt, directory, output_filepath, include_class_ids, exclude_class_ids):
    """Keep only the labels whose class ids survive the include/exclude filter.

    If include_class_ids is empty, every class id up to the dataset maximum is
    kept except those listed in exclude_class_ids. Images that end up with no
    labels are removed from the output dataset.

    Args:
        main_txt (str): Contents of the dataset's main txt file.
        directory (pathlib.Path): Directory that relative paths resolve against.
        output_filepath: Destination for the filtered main txt file.
        include_class_ids (list[int]): Class ids to keep (may be empty).
        exclude_class_ids (list[int]): Class ids to drop (used when include is empty).
    """
    dataset = SimpleDatasetFactory().load(main_txt, directory)
    max_class_id = dataset.get_max_class_id()
    if not include_class_ids:
        include_class_ids = [i for i in range(max_class_id + 1) if i not in exclude_class_ids]
    kept_ids = set(include_class_ids)

    # Warn about requested ids that cannot appear in this dataset.
    for c in kept_ids:
        if c > max_class_id:
            logger.warning(f"The class {c} is not in the dataset.")

    if dataset.type == 'image_classification':
        filtered = []
        for image, labels in dataset:
            kept_labels = [i for i in labels if i in kept_ids]
            if kept_labels:  # Drop images that have no labels left.
                filtered.append((image, kept_labels))
        dataset = ImageClassificationDataset(filtered, directory)
    elif dataset.type == 'object_detection':
        filtered = []
        for image, labels in dataset:
            # A label is a box record whose first element is the class id.
            kept_boxes = [x for x in labels if x[0] in kept_ids]
            if kept_boxes:  # Drop images that have no boxes left.
                filtered.append((image, kept_boxes))
        dataset = ObjectDetectionDataset(filtered, directory)
    else:
        raise RuntimeError

    DatasetWriter(directory).write(dataset, output_filepath)
    print(f"Successfully saved {output_filepath}")


def main():
    """CLI entry point: parse arguments and filter the dataset by class id."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('main_txt_filepath', type=pathlib.Path)
    arg_parser.add_argument('output_filename')
    arg_parser.add_argument('--include_class', nargs='*', default=[], metavar='CLASS_ID')
    arg_parser.add_argument('--exclude_class', nargs='*', default=[], metavar='CLASS_ID')
    parsed = arg_parser.parse_args()

    # The output must live next to the input main txt file.
    if '/' in parsed.output_filename:
        arg_parser.error("The output file must be in the same directory.")

    main_txt = parsed.main_txt_filepath.read_text()
    dataset_directory = parsed.main_txt_filepath.parent

    if (dataset_directory / parsed.output_filename).exists():
        arg_parser.error(f"{parsed.output_filename} already exists.")

    # Exactly one of the two filters must be given (not both, not neither).
    if bool(parsed.include_class) == bool(parsed.exclude_class):
        arg_parser.error("--include_class or --exclude_class must be specified.")

    include_ids = [int(c) for c in parsed.include_class]
    exclude_ids = [int(c) for c in parsed.exclude_class]
    filter_dataset(main_txt, dataset_directory, parsed.output_filename, include_ids, exclude_ids)


if __name__ == '__main__':
    main()
43 changes: 43 additions & 0 deletions simpledataset/commands/map.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import argparse
import pathlib
from simpledataset.common import SimpleDatasetFactory, ImageClassificationDataset, ObjectDetectionDataset, DatasetWriter


def map_dataset(main_txt, directory, output_filepath, mappings_list):
    """Rewrite class ids in a dataset according to src->dst mappings.

    Args:
        main_txt (str): Contents of the dataset's main txt file.
        directory (pathlib.Path): Directory that relative paths resolve against.
        output_filepath: Destination for the remapped main txt file.
        mappings_list: Iterable of (src_class_id, dst_class_id) pairs; values may
            be strings (as produced by argparse) and are converted to int.

    Class ids that appear in no mapping are left unchanged.

    Raises:
        RuntimeError: If the dataset type is neither classification nor detection.
    """
    dataset = SimpleDatasetFactory().load(main_txt, directory)
    mappings = {int(src): int(dst) for src, dst in mappings_list}

    if dataset.type == 'image_classification':
        # Labels are plain class ids. Use list comprehensions (not lazy map
        # objects) so the label sequences can be iterated more than once.
        data = [(image, [mappings.get(c, c) for c in labels]) for image, labels in dataset]
        dataset = ImageClassificationDataset(data, directory)
    elif dataset.type == 'object_detection':
        # Each label is (class_id, x_min, y_min, x_max, y_max); only the class
        # id is remapped. BUG FIX: the original code did mappings.get(x, x)
        # with the whole label tuple as key, which never matched the int keys,
        # so detection class ids were never remapped.
        data = [(image, [(mappings.get(x[0], x[0]), *x[1:]) for x in labels]) for image, labels in dataset]
        dataset = ObjectDetectionDataset(data, directory)
    else:
        raise RuntimeError

    DatasetWriter(directory).write(dataset, output_filepath)
    print(f"Successfully saved {output_filepath}")


def main():
    """CLI entry point: parse arguments and remap dataset class ids."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('main_txt_filepath', type=pathlib.Path)
    arg_parser.add_argument('output_filename')
    arg_parser.add_argument('--map', nargs=2, required=True, action='append', metavar=('src_class_id', 'dst_class_id'))
    parsed = arg_parser.parse_args()

    # The output must live next to the input main txt file.
    if '/' in parsed.output_filename:
        arg_parser.error("The output file must be in the same directory.")

    main_txt = parsed.main_txt_filepath.read_text()
    dataset_directory = parsed.main_txt_filepath.parent

    if (dataset_directory / parsed.output_filename).exists():
        arg_parser.error(f"{parsed.output_filename} already exists.")

    map_dataset(main_txt, dataset_directory, parsed.output_filename, parsed.map)


if __name__ == '__main__':
    main()
Empty file added simpledataset/commands/pack.py
Empty file.
Empty file.
41 changes: 41 additions & 0 deletions simpledataset/commands/summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import argparse
import collections
import pathlib
from simpledataset.common import SimpleDatasetFactory


def print_summary(main_txt, directory):
    """Print image count, class count, max class id and per-class sample counts.

    Args:
        main_txt (str): Contents of the dataset's main txt file.
        directory (pathlib.Path): Directory that relative paths resolve against.

    Raises:
        RuntimeError: If the dataset type is neither classification nor detection.
    """
    dataset = SimpleDatasetFactory().load(main_txt, directory)
    num_class_samples = collections.Counter()

    if dataset.type == 'image_classification':
        # Labels are plain class ids.
        for _, labels in dataset:
            num_class_samples.update(labels)
    elif dataset.type == 'object_detection':
        # Each label is a box record whose first element is the class id.
        for _, labels in dataset:
            num_class_samples.update(box[0] for box in labels)
    else:
        raise RuntimeError

    print(f"The number of images: {len(dataset)}")
    print(f"The number of classes: {len(num_class_samples)}")
    print(f"Max class id: {dataset.get_max_class_id()}")
    print("Class distribution:")
    for class_id in sorted(num_class_samples.keys()):
        print(f"    Class {class_id}: {num_class_samples[class_id]}")


def main():
    """CLI entry point: parse arguments and print the dataset summary."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('main_txt_filepath', type=pathlib.Path)
    parsed = arg_parser.parse_args()

    main_txt = parsed.main_txt_filepath.read_text()
    dataset_directory = parsed.main_txt_filepath.parent
    print_summary(main_txt, dataset_directory)


if __name__ == '__main__':
    main()
4 changes: 4 additions & 0 deletions simpledataset/common/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .dataset import SimpleDatasetFactory, ImageClassificationDataset, ObjectDetectionDataset
from .dataset_writer import DatasetWriter

__all__ = ['SimpleDatasetFactory', 'ImageClassificationDataset', 'ObjectDetectionDataset', 'DatasetWriter']
Loading

0 comments on commit 9e6c853

Please sign in to comment.