Skip to content

Commit

Permalink
Initial commit. version 0.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
shono committed Jun 21, 2021
0 parents commit 9e6c853
Show file tree
Hide file tree
Showing 17 changed files with 612 additions and 0 deletions.
96 changes: 96 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# simpledataset

Utility tools for simple vision image dataset format. WORK IN PROGRESS.

## Features
* See the summary of a dataset
* Convert from/to various dataset formats.
* Web UI to look into a dataset
* CLI tools to split and concat datasets.
* CLI tools to modify labels.

## Install
```
pip install simpledataset==0.1.0
```

## Usage
```
# Show summary
dataset_summary <input_dataset>
# For Classification dataset, extract only the images that have the specified labels.
# For Detection dataset, extract only the boxes that have the specified labels.
dataset_filter <input_dataset> <output_dataset> [--include_class <class_id> [<class_id> ...]] [--exclude_class <class_id> [<class_id> ...]]
# Update class labels
dataset_map <input_dataset> <output_dataset> --map <src_class_id> <dst_class_id> [--map <src_class_id> <dst_class_id> [--map...]]
dataset_split # NYI
dataset_concat # NYI
dataset_shuffle # NYI
dataset_sample # NYI
dataset_pack # NYI
# Convert from/to other dataset types. COCO format is supported now.
dataset_convert_from <input_dataset> <input_format> <output_filepath>
dataset_convert_to <input_dataset> <output_format> <output_filepath>
```


## Examples
### Change class ids
For example, if you would like to turn MNIST into an odd/even classification dataset, you can use the dataset_map command. In this example, we use class_id=0 for even numbers and class_id=1 for odd numbers.
```bash
dataset_map mnist.txt new_dataset.txt --map 2 0 --map 3 1 --map 4 0 --map 5 1 --map 6 0 --map 7 1 --map 8 0 --map 9 1
```


## SIMPLE Dataset format
Currently there are two dataset formats: Image Classification and Object Detection. Both datasets have a single txt file, image files, and an optional list of label names (labels.txt). In addition to that, Object Detection datasets have label files that contain bbox info.

### Image Classification
The main txt format is:
```
<file> ::= <txt_line> ('\n' <txt_line>)*
<txt_line> ::= <image_filepath> ' ' <labels>
<image_filepath> ::= <filepath> | <zip_filepath> '@' <entry_name>
<labels> ::= <class_id> (',' <class_id>)*
```

Here is an example txt file.
```
train_images.zip@0.jpg 0
train_images2.zip@1.jpg 1
image.png 0,1
image2.bmp 0,1,2,3
```

### Object Detection
The main txt format is:
```
<file> ::= <txt_line> ('\n' <txt_line>)*
<txt_line> ::= <image_filepath> ' ' <label_filepath>
<image_filepath> ::= <filepath> | <zip_filepath> '@' <entry_name>
<label_filepath> ::= <filepath> | <zip_filepath> '@' <entry_name>
```

The format of a label file is:
```
<file> ::= <label_line> ('\n' <label_line>)*
<label_line> ::= <class_id> ' ' <bbox_x_min> ' ' <bbox_y_min> ' ' <bbox_x_max> ' ' <bbox_y_max>
<class_id> ::= <int>
<bbox_x_min> ::= <int> ; 0 <= <bbox_x_min> < <bbox_x_max> <= <image_width>
<bbox_y_min> ::= <int> ; 0 <= <bbox_y_min> < <bbox_y_max> <= <image_height>
<bbox_x_max> ::= <int>
<bbox_y_max> ::= <int>
```


## Usage for remote datasets
NYI.
This tool allows you to use datasets on Azure Blob Storage. You can update a dataset on the storage efficiently.
21 changes: 21 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import setuptools


# Package metadata and entry points for the simpledataset distribution.
setuptools.setup(name='simpledataset',
                 version='0.1.0',
                 description="Utility tools for SIMPLE vision dataset format.",
                 packages=setuptools.find_packages(),
                 license='MIT',
                 url='https://github.com/shonohs/simpledataset',
                 classifiers=[
                     'Intended Audience :: Developers',
                     'License :: OSI Approved :: MIT License'
                 ],
                 # Each console script maps a CLI command name to a module's main()
                 # in simpledataset.commands (see the README "Usage" section).
                 entry_points={
                     'console_scripts': [
                         'dataset_convert_from=simpledataset.commands.convert_from:main',
                         'dataset_convert_to=simpledataset.commands.convert_to:main',
                         'dataset_filter=simpledataset.commands.filter:main',
                         'dataset_map=simpledataset.commands.map:main',
                         'dataset_summary=simpledataset.commands.summary:main'
                     ]})
Empty file added simpledataset/__init__.py
Empty file.
Empty file.
33 changes: 33 additions & 0 deletions simpledataset/commands/convert_from.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import argparse
import pathlib
from simpledataset.common import DatasetWriter
from simpledataset.converters import CocoReader


def convert_from(input_filepath, source_format, output_filepath):
    """Read a dataset in the given source format and save it in SIMPLE format.

    Args:
        input_filepath (pathlib.Path): Path to the source dataset file.
        source_format (str): Source format name; only 'coco' is supported.
        output_filepath (pathlib.Path): Where to write the SIMPLE main txt file.

    Raises:
        RuntimeError: If source_format is not a supported format.
    """
    if source_format != 'coco':
        raise RuntimeError(f"Unsupported format: {source_format}")

    # The reader resolves relative file references against the input's directory.
    dataset = CocoReader().read(input_filepath, input_filepath.parent)

    output_directory = output_filepath.parent
    writer = DatasetWriter(output_directory)
    writer.write(dataset, output_filepath)
    print(f"Successfully saved {output_filepath}.")


def main():
    """CLI entry point: parse arguments and run the format conversion."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('input_filepath', type=pathlib.Path)
    arg_parser.add_argument('source_format', choices=['coco'])
    arg_parser.add_argument('output_filepath', type=pathlib.Path)
    parsed = arg_parser.parse_args()

    # Refuse to clobber an existing output file.
    if parsed.output_filepath.exists():
        arg_parser.error(f"{parsed.output_filepath} already exists.")

    convert_from(parsed.input_filepath, parsed.source_format, parsed.output_filepath)


if __name__ == '__main__':
    main()
35 changes: 35 additions & 0 deletions simpledataset/commands/convert_to.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import argparse
import pathlib
from simpledataset.common import SimpleDatasetFactory
from simpledataset.converters import CocoWriter


def convert_to(main_txt, directory, target_format, output_filepath):
    """Load a SIMPLE dataset and write it out in the given target format.

    Args:
        main_txt (str): Contents of the dataset's main txt file.
        directory (pathlib.Path): Directory that relative paths in main_txt resolve against.
        target_format (str): Target format name; only 'coco' is supported.
        output_filepath (pathlib.Path): Where to write the converted dataset.

    Raises:
        RuntimeError: If target_format is not a supported format.
    """
    # Load first so dataset parse errors surface before the format check,
    # matching the original error precedence.
    dataset = SimpleDatasetFactory().load(main_txt, directory)

    if target_format != 'coco':
        raise RuntimeError(f"Unsupported format: {target_format}")

    writer = CocoWriter()
    writer.write(dataset, output_filepath, output_filepath.parent)
    print(f"Successfully saved to {output_filepath}")


def main():
    """CLI entry point: parse arguments and run the dataset export."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('main_txt_filepath', type=pathlib.Path)
    arg_parser.add_argument('target_format', choices=['coco'])
    arg_parser.add_argument('output_filepath', type=pathlib.Path)
    parsed = arg_parser.parse_args()

    main_txt = parsed.main_txt_filepath.read_text()
    dataset_directory = parsed.main_txt_filepath.parent

    # Refuse to clobber an existing output file.
    if parsed.output_filepath.exists():
        arg_parser.error(f"{parsed.output_filepath} already exists.")

    convert_to(main_txt, dataset_directory, parsed.target_format, parsed.output_filepath)


if __name__ == '__main__':
    main()
61 changes: 61 additions & 0 deletions simpledataset/commands/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import argparse
import logging
import pathlib
from simpledataset.common import SimpleDatasetFactory, ImageClassificationDataset, ObjectDetectionDataset, DatasetWriter

logger = logging.getLogger(__name__)


def filter_dataset(main_txt, directory, output_filepath, include_class_ids, exclude_class_ids):
    """Keep only the labels whose class ids survive the include/exclude filter.

    If include_class_ids is empty, every class id up to the dataset maximum is
    kept except those listed in exclude_class_ids. Images that end up with no
    labels are removed from the output dataset.

    Args:
        main_txt (str): Contents of the dataset's main txt file.
        directory (pathlib.Path): Directory that relative paths resolve against.
        output_filepath: Destination for the filtered main txt file.
        include_class_ids (list[int]): Class ids to keep (may be empty).
        exclude_class_ids (list[int]): Class ids to drop (used when include is empty).
    """
    dataset = SimpleDatasetFactory().load(main_txt, directory)
    max_class_id = dataset.get_max_class_id()
    if not include_class_ids:
        include_class_ids = [i for i in range(max_class_id + 1) if i not in exclude_class_ids]
    kept_ids = set(include_class_ids)

    # Warn about requested ids that cannot appear in this dataset.
    for c in kept_ids:
        if c > max_class_id:
            logger.warning(f"The class {c} is not in the dataset.")

    if dataset.type == 'image_classification':
        filtered = []
        for image, labels in dataset:
            kept_labels = [i for i in labels if i in kept_ids]
            if kept_labels:  # Drop images that have no labels left.
                filtered.append((image, kept_labels))
        dataset = ImageClassificationDataset(filtered, directory)
    elif dataset.type == 'object_detection':
        filtered = []
        for image, labels in dataset:
            # A label is a box record whose first element is the class id.
            kept_boxes = [x for x in labels if x[0] in kept_ids]
            if kept_boxes:  # Drop images that have no boxes left.
                filtered.append((image, kept_boxes))
        dataset = ObjectDetectionDataset(filtered, directory)
    else:
        raise RuntimeError

    DatasetWriter(directory).write(dataset, output_filepath)
    print(f"Successfully saved {output_filepath}")


def main():
    """CLI entry point: parse arguments and filter the dataset by class id."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('main_txt_filepath', type=pathlib.Path)
    arg_parser.add_argument('output_filename')
    arg_parser.add_argument('--include_class', nargs='*', default=[], metavar='CLASS_ID')
    arg_parser.add_argument('--exclude_class', nargs='*', default=[], metavar='CLASS_ID')
    parsed = arg_parser.parse_args()

    # The output must live next to the input main txt file.
    if '/' in parsed.output_filename:
        arg_parser.error("The output file must be in the same directory.")

    main_txt = parsed.main_txt_filepath.read_text()
    dataset_directory = parsed.main_txt_filepath.parent

    if (dataset_directory / parsed.output_filename).exists():
        arg_parser.error(f"{parsed.output_filename} already exists.")

    # Exactly one of the two filters must be given (not both, not neither).
    if bool(parsed.include_class) == bool(parsed.exclude_class):
        arg_parser.error("--include_class or --exclude_class must be specified.")

    include_ids = [int(c) for c in parsed.include_class]
    exclude_ids = [int(c) for c in parsed.exclude_class]
    filter_dataset(main_txt, dataset_directory, parsed.output_filename, include_ids, exclude_ids)


if __name__ == '__main__':
    main()
43 changes: 43 additions & 0 deletions simpledataset/commands/map.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import argparse
import pathlib
from simpledataset.common import SimpleDatasetFactory, ImageClassificationDataset, ObjectDetectionDataset, DatasetWriter


def map_dataset(main_txt, directory, output_filepath, mappings_list):
    """Rewrite class ids in a dataset according to src->dst mappings.

    Args:
        main_txt (str): Contents of the dataset's main txt file.
        directory (pathlib.Path): Directory that relative paths resolve against.
        output_filepath: Destination for the remapped main txt file.
        mappings_list: Iterable of (src_class_id, dst_class_id) pairs; values may
            be strings (as produced by argparse) and are converted to int.

    Class ids that appear in no mapping are left unchanged.

    Raises:
        RuntimeError: If the dataset type is neither classification nor detection.
    """
    dataset = SimpleDatasetFactory().load(main_txt, directory)
    mappings = {int(src): int(dst) for src, dst in mappings_list}

    if dataset.type == 'image_classification':
        # Labels are plain class ids. Use list comprehensions (not lazy map
        # objects) so the label sequences can be iterated more than once.
        data = [(image, [mappings.get(c, c) for c in labels]) for image, labels in dataset]
        dataset = ImageClassificationDataset(data, directory)
    elif dataset.type == 'object_detection':
        # Each label is (class_id, x_min, y_min, x_max, y_max); only the class
        # id is remapped. BUG FIX: the original code did mappings.get(x, x)
        # with the whole label tuple as key, which never matched the int keys,
        # so detection class ids were never remapped.
        data = [(image, [(mappings.get(x[0], x[0]), *x[1:]) for x in labels]) for image, labels in dataset]
        dataset = ObjectDetectionDataset(data, directory)
    else:
        raise RuntimeError

    DatasetWriter(directory).write(dataset, output_filepath)
    print(f"Successfully saved {output_filepath}")


def main():
    """CLI entry point: parse arguments and remap dataset class ids."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('main_txt_filepath', type=pathlib.Path)
    arg_parser.add_argument('output_filename')
    arg_parser.add_argument('--map', nargs=2, required=True, action='append', metavar=('src_class_id', 'dst_class_id'))
    parsed = arg_parser.parse_args()

    # The output must live next to the input main txt file.
    if '/' in parsed.output_filename:
        arg_parser.error("The output file must be in the same directory.")

    main_txt = parsed.main_txt_filepath.read_text()
    dataset_directory = parsed.main_txt_filepath.parent

    if (dataset_directory / parsed.output_filename).exists():
        arg_parser.error(f"{parsed.output_filename} already exists.")

    map_dataset(main_txt, dataset_directory, parsed.output_filename, parsed.map)


if __name__ == '__main__':
    main()
Empty file added simpledataset/commands/pack.py
Empty file.
Empty file.
41 changes: 41 additions & 0 deletions simpledataset/commands/summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import argparse
import collections
import pathlib
from simpledataset.common import SimpleDatasetFactory


def print_summary(main_txt, directory):
    """Print image count, class count, max class id and per-class sample counts.

    Args:
        main_txt (str): Contents of the dataset's main txt file.
        directory (pathlib.Path): Directory that relative paths resolve against.

    Raises:
        RuntimeError: If the dataset type is neither classification nor detection.
    """
    dataset = SimpleDatasetFactory().load(main_txt, directory)
    num_class_samples = collections.Counter()

    if dataset.type == 'image_classification':
        # Labels are plain class ids.
        for _, labels in dataset:
            num_class_samples.update(labels)
    elif dataset.type == 'object_detection':
        # Each label is a box record whose first element is the class id.
        for _, labels in dataset:
            num_class_samples.update(box[0] for box in labels)
    else:
        raise RuntimeError

    print(f"The number of images: {len(dataset)}")
    print(f"The number of classes: {len(num_class_samples)}")
    print(f"Max class id: {dataset.get_max_class_id()}")
    print("Class distribution:")
    for class_id in sorted(num_class_samples.keys()):
        print(f"    Class {class_id}: {num_class_samples[class_id]}")


def main():
    """CLI entry point: parse arguments and print the dataset summary."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('main_txt_filepath', type=pathlib.Path)
    parsed = arg_parser.parse_args()

    main_txt = parsed.main_txt_filepath.read_text()
    dataset_directory = parsed.main_txt_filepath.parent
    print_summary(main_txt, dataset_directory)


if __name__ == '__main__':
    main()
4 changes: 4 additions & 0 deletions simpledataset/common/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .dataset import SimpleDatasetFactory, ImageClassificationDataset, ObjectDetectionDataset
from .dataset_writer import DatasetWriter

__all__ = ['SimpleDatasetFactory', 'ImageClassificationDataset', 'ObjectDetectionDataset', 'DatasetWriter']
Loading

0 comments on commit 9e6c853

Please sign in to comment.