Commit 9e6c853, committed by shono on Jun 21, 2021 (initial commit, 0 parents). Showing 17 changed files with 612 additions and 0 deletions.
README.md
# simpledataset

Utility tools for the SIMPLE vision image dataset format. WORK IN PROGRESS.

## Features
* See the summary of a dataset.
* Convert from/to various dataset formats.
* Web UI to look into a dataset.
* CLI tools to split and concatenate datasets.
* CLI tools to modify labels.

## Install
```
pip install simpledataset==0.1.0
```
## Usage
```
# Show the summary of a dataset.
dataset_summary <input_dataset>

# For a classification dataset, extract only the images that have the specified labels.
# For a detection dataset, extract only the boxes that have the specified labels.
dataset_filter <input_dataset> <output_dataset> [--include_class <class_id> [<class_id> ...]] [--exclude_class <class_id> [<class_id> ...]]

# Update class labels.
dataset_map <input_dataset> <output_dataset> --map <src_class_id> <dst_class_id> [--map <src_class_id> <dst_class_id> ...]

dataset_split    # NYI
dataset_concat   # NYI
dataset_shuffle  # NYI
dataset_sample   # NYI
dataset_pack     # NYI

# Convert from/to other dataset types. Only the COCO format is supported for now.
dataset_convert_from <input_dataset> <input_format> <output_filepath>
dataset_convert_to <input_dataset> <output_format> <output_filepath>
```
## Examples
### Change class ids
For example, if you would like to turn MNIST into an odd/even classification dataset, you can use the dataset_map command. In this example we use class_id=0 for even numbers and class_id=1 for odd numbers, so classes 0 and 1 can stay as they are and every other digit is remapped.
```bash
dataset_map mnist.txt new_dataset.txt --map 2 0 --map 3 1 --map 4 0 --map 5 1 --map 6 0 --map 7 1 --map 8 0 --map 9 1
```
## SIMPLE Dataset format
Currently there are two dataset formats: Image Classification and Object Detection. Both consist of a single main txt file, the image files, and an optional list of label names (labels.txt). In addition, an Object Detection dataset has label files that contain the bounding box info.

### Image Classification
The main txt format is:
```
<file> ::= <txt_line> ('\n' <txt_line>)*
<txt_line> ::= <image_filepath> ' ' <labels>
<image_filepath> ::= <filepath> | <zip_filepath> '@' <entry_name>
<labels> ::= <class_id> (',' <class_id>)*
```

Here is an example main txt file:
```
train_images.zip@0.jpg 0
train_images2.zip@1.jpg 1
image.png 0,1
image2.bmp 0,1,2,3
```
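To make the grammar concrete, a main txt line can be parsed with two string splits. This is only a minimal sketch based on the grammar above, not the library's own parser:

```python
def parse_classification_line(txt_line):
    """Parse one classification main txt line into (image_filepath, [class_id, ...])."""
    image_filepath, labels = txt_line.split(' ')
    return image_filepath, [int(class_id) for class_id in labels.split(',')]


print(parse_classification_line('train_images.zip@0.jpg 0'))  # ('train_images.zip@0.jpg', [0])
print(parse_classification_line('image2.bmp 0,1,2,3'))        # ('image2.bmp', [0, 1, 2, 3])
```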
### Object Detection
The main txt format is:
```
<file> ::= <txt_line> ('\n' <txt_line>)*
<txt_line> ::= <image_filepath> ' ' <label_filepath>
<image_filepath> ::= <filepath> | <zip_filepath> '@' <entry_name>
<label_filepath> ::= <filepath> | <zip_filepath> '@' <entry_name>
```

The format of a label file is:
```
<file> ::= <label_line> ('\n' <label_line>)*
<label_line> ::= <class_id> ' ' <bbox_x_min> ' ' <bbox_y_min> ' ' <bbox_x_max> ' ' <bbox_y_max>
<class_id> ::= <int>
<bbox_x_min> ::= <int>  ; 0 <= <bbox_x_min> < <bbox_x_max> <= <image_width>
<bbox_y_min> ::= <int>  ; 0 <= <bbox_y_min> < <bbox_y_max> <= <image_height>
<bbox_x_max> ::= <int>
<bbox_y_max> ::= <int>
```
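Here is a hypothetical example consistent with the grammar above (the file names and coordinates are made up for illustration). The main txt pairs each image with its label file:

```
train_images.zip@0.jpg train_labels.zip@0.txt
image.png image.txt
```

and a label file such as image.txt holds one box per line as `<class_id> <x_min> <y_min> <x_max> <y_max>`:

```
0 10 10 100 200
2 50 60 80 120
```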
## Usage for remote datasets
NYI. This tool will allow you to use datasets on Azure Blob Storage, so that you can update a dataset on the storage efficiently.
setup.py
import setuptools


setuptools.setup(name='simpledataset',
                 version='0.1.0',
                 description="Utility tools for SIMPLE vision dataset format.",
                 packages=setuptools.find_packages(),
                 license='MIT',
                 url='https://github.com/shonohs/simpledataset',
                 classifiers=[
                     'Intended Audience :: Developers',
                     'License :: OSI Approved :: MIT License'
                 ],
                 entry_points={
                     'console_scripts': [
                         'dataset_convert_from=simpledataset.commands.convert_from:main',
                         'dataset_convert_to=simpledataset.commands.convert_to:main',
                         'dataset_filter=simpledataset.commands.filter:main',
                         'dataset_map=simpledataset.commands.map:main',
                         'dataset_summary=simpledataset.commands.summary:main'
                     ]})
(Two empty files added.)
simpledataset/commands/convert_from.py
import argparse
import pathlib
from simpledataset.common import DatasetWriter
from simpledataset.converters import CocoReader


def convert_from(input_filepath, source_format, output_filepath):
    if source_format == 'coco':
        dataset = CocoReader().read(input_filepath, input_filepath.parent)
    else:
        raise RuntimeError(f"Unsupported format: {source_format}")

    directory = output_filepath.parent
    DatasetWriter(directory).write(dataset, output_filepath)
    print(f"Successfully saved {output_filepath}.")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('input_filepath', type=pathlib.Path)
    parser.add_argument('source_format', choices=['coco'])
    parser.add_argument('output_filepath', type=pathlib.Path)

    args = parser.parse_args()

    if args.output_filepath.exists():
        parser.error(f"{args.output_filepath} already exists.")

    convert_from(args.input_filepath, args.source_format, args.output_filepath)


if __name__ == '__main__':
    main()
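The same conversion can also be run from Python by calling convert_from directly. A minimal sketch, assuming the module path declared in setup.py (simpledataset.commands.convert_from) and hypothetical file names:

```python
import pathlib

from simpledataset.commands.convert_from import convert_from

# Hypothetical paths: a COCO annotation file and the SIMPLE main txt to create.
coco_json = pathlib.Path('data/instances.json')
output_txt = pathlib.Path('data/dataset.txt')

# Reads the COCO file (images are resolved relative to its parent directory)
# and writes a SIMPLE dataset next to output_txt.
convert_from(coco_json, 'coco', output_txt)
```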
simpledataset/commands/convert_to.py
import argparse
import pathlib
from simpledataset.common import SimpleDatasetFactory
from simpledataset.converters import CocoWriter


def convert_to(main_txt, directory, target_format, output_filepath):
    dataset = SimpleDatasetFactory().load(main_txt, directory)

    if target_format == 'coco':
        CocoWriter().write(dataset, output_filepath, output_filepath.parent)
        print(f"Successfully saved to {output_filepath}")
    else:
        raise RuntimeError(f"Unsupported format: {target_format}")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('main_txt_filepath', type=pathlib.Path)
    parser.add_argument('target_format', choices=['coco'])
    parser.add_argument('output_filepath', type=pathlib.Path)

    args = parser.parse_args()

    main_txt = args.main_txt_filepath.read_text()
    directory = args.main_txt_filepath.parent

    if args.output_filepath.exists():
        parser.error(f"{args.output_filepath} already exists.")

    convert_to(main_txt, directory, args.target_format, args.output_filepath)


if __name__ == '__main__':
    main()
simpledataset/commands/filter.py
import argparse
import logging
import pathlib
from simpledataset.common import SimpleDatasetFactory, ImageClassificationDataset, ObjectDetectionDataset, DatasetWriter

logger = logging.getLogger(__name__)


def filter_dataset(main_txt, directory, output_filepath, include_class_ids, exclude_class_ids):
    dataset = SimpleDatasetFactory().load(main_txt, directory)
    max_class_id = dataset.get_max_class_id()
    include_class_ids = set(include_class_ids or [i for i in range(max_class_id + 1) if i not in exclude_class_ids])

    for c in include_class_ids:
        if c > max_class_id:
            logger.warning(f"The class {c} is not in the dataset.")

    if dataset.type == 'image_classification':
        data = [(image, [i for i in labels if i in include_class_ids]) for image, labels in dataset]
        # Remove images that have no labels.
        data = [d for d in data if d[1]]
        dataset = ImageClassificationDataset(data, directory)
    elif dataset.type == 'object_detection':
        data = [(image, [x for x in labels if x[0] in include_class_ids]) for image, labels in dataset]
        # Remove images that have no labels.
        data = [d for d in data if d[1]]
        dataset = ObjectDetectionDataset(data, directory)
    else:
        raise RuntimeError

    DatasetWriter(directory).write(dataset, output_filepath)
    print(f"Successfully saved {output_filepath}")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('main_txt_filepath', type=pathlib.Path)
    parser.add_argument('output_filename')
    parser.add_argument('--include_class', nargs='*', default=[], metavar='CLASS_ID')
    parser.add_argument('--exclude_class', nargs='*', default=[], metavar='CLASS_ID')
    args = parser.parse_args()

    if '/' in args.output_filename:
        parser.error("The output file must be in the same directory.")

    main_txt = args.main_txt_filepath.read_text()
    directory = args.main_txt_filepath.parent

    if (directory / args.output_filename).exists():
        parser.error(f"{args.output_filename} already exists.")

    if (args.include_class and args.exclude_class) or not (args.include_class or args.exclude_class):
        parser.error("Exactly one of --include_class or --exclude_class must be specified.")

    include_class_ids = [int(c) for c in args.include_class]
    exclude_class_ids = [int(c) for c in args.exclude_class]
    filter_dataset(main_txt, directory, args.output_filename, include_class_ids, exclude_class_ids)


if __name__ == '__main__':
    main()
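filter_dataset can also be called programmatically; the arguments mirror the CLI. A minimal sketch with hypothetical file names, keeping only class ids 0 and 1 (an empty exclude list is passed, as the CLI does when only --include_class is given):

```python
import pathlib

from simpledataset.commands.filter import filter_dataset

main_txt_path = pathlib.Path('data/train.txt')  # hypothetical SIMPLE dataset
main_txt = main_txt_path.read_text()
directory = main_txt_path.parent

# The output name is resolved inside the dataset directory, matching the
# CLI's "same directory" restriction.
filter_dataset(main_txt, directory, 'train_01.txt', [0, 1], [])
```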
simpledataset/commands/map.py
import argparse
import pathlib
from simpledataset.common import SimpleDatasetFactory, ImageClassificationDataset, ObjectDetectionDataset, DatasetWriter


def map_dataset(main_txt, directory, output_filepath, mappings_list):
    dataset = SimpleDatasetFactory().load(main_txt, directory)
    mappings = {int(src): int(dst) for src, dst in mappings_list}

    if dataset.type == 'image_classification':
        data = [(image, [mappings.get(x, x) for x in labels]) for image, labels in dataset]
        dataset = ImageClassificationDataset(data, directory)
    elif dataset.type == 'object_detection':
        # Each label is (class_id, x_min, y_min, x_max, y_max); remap only the class id.
        data = [(image, [(mappings.get(x[0], x[0]), *x[1:]) for x in labels]) for image, labels in dataset]
        dataset = ObjectDetectionDataset(data, directory)
    else:
        raise RuntimeError

    DatasetWriter(directory).write(dataset, output_filepath)
    print(f"Successfully saved {output_filepath}")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('main_txt_filepath', type=pathlib.Path)
    parser.add_argument('output_filename')
    parser.add_argument('--map', nargs=2, required=True, action='append', metavar=('src_class_id', 'dst_class_id'))
    args = parser.parse_args()

    if '/' in args.output_filename:
        parser.error("The output file must be in the same directory.")

    main_txt = args.main_txt_filepath.read_text()
    directory = args.main_txt_filepath.parent

    if (directory / args.output_filename).exists():
        parser.error(f"{args.output_filename} already exists.")

    map_dataset(main_txt, directory, args.output_filename, args.map)


if __name__ == '__main__':
    main()
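To illustrate the remapping semantics, the --map pairs become an int-to-int dict, and mappings.get(x, x) leaves any class id without a mapping untouched. A small self-contained sketch that mirrors the README's MNIST odd/even example:

```python
# Equivalent of: --map 2 0 --map 3 1 --map 4 0 --map 5 1 --map 6 0 --map 7 1 --map 8 0 --map 9 1
mappings = {2: 0, 3: 1, 4: 0, 5: 1, 6: 0, 7: 1, 8: 0, 9: 1}

labels = [0, 1, 2, 3, 7, 8]
remapped = [mappings.get(x, x) for x in labels]
print(remapped)  # [0, 1, 0, 1, 1, 0]; 0 and 1 already carry the even/odd ids, so they pass through
```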
(Two empty files added.)
simpledataset/commands/summary.py
import argparse
import collections
import pathlib
from simpledataset.common import SimpleDatasetFactory


def print_summary(main_txt, directory):
    dataset = SimpleDatasetFactory().load(main_txt, directory)
    num_class_samples = collections.Counter()

    if dataset.type == 'image_classification':
        for image, labels in dataset:
            num_class_samples.update(labels)
    elif dataset.type == 'object_detection':
        for image, labels in dataset:
            num_class_samples.update(x[0] for x in labels)
    else:
        raise RuntimeError

    print(f"The number of images: {len(dataset)}")
    print(f"The number of classes: {len(num_class_samples)}")
    print(f"Max class id: {dataset.get_max_class_id()}")
    print("Class distribution:")
    for class_id in sorted(num_class_samples.keys()):
        print(f"    Class {class_id}: {num_class_samples[class_id]}")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('main_txt_filepath', type=pathlib.Path)

    args = parser.parse_args()

    main_txt = args.main_txt_filepath.read_text()
    directory = args.main_txt_filepath.parent

    print_summary(main_txt, directory)


if __name__ == '__main__':
    main()
simpledataset/common/__init__.py
from .dataset import SimpleDatasetFactory, ImageClassificationDataset, ObjectDetectionDataset
from .dataset_writer import DatasetWriter

__all__ = ['SimpleDatasetFactory', 'ImageClassificationDataset', 'ObjectDetectionDataset', 'DatasetWriter']
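Taken together with the command modules above, the public API follows one pattern: load a dataset from its main txt, transform it, and write it back. A minimal sketch, assuming a local dataset.txt (hypothetical path) and the loader/writer behavior shown in the commands:

```python
import pathlib

from simpledataset.common import DatasetWriter, SimpleDatasetFactory

main_txt_path = pathlib.Path('dataset.txt')  # hypothetical dataset
directory = main_txt_path.parent

# Load the dataset the same way the CLI commands do.
dataset = SimpleDatasetFactory().load(main_txt_path.read_text(), directory)
print(dataset.type, len(dataset), dataset.get_max_class_id())

# Write it back out under a new name in the same directory.
DatasetWriter(directory).write(dataset, 'copy.txt')
```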