From e487d0a3e3345690af16de7b53356bceca8c45d3 Mon Sep 17 00:00:00 2001 From: Louis-Dupont <35190946+Louis-Dupont@users.noreply.github.com> Date: Thu, 31 Aug 2023 17:21:38 +0300 Subject: [PATCH] Automatize dataset description (#179) * add script and run * fix * squash all changes * minor update * use __file__ --- documentation/datasets.md | 593 ++++++++++++++++++++++++++--------- scripts/describe_datasets.py | 77 +++++ 2 files changed, 515 insertions(+), 155 deletions(-) create mode 100644 scripts/describe_datasets.py diff --git a/documentation/datasets.md b/documentation/datasets.md index fb676a48..bf8022e2 100644 --- a/documentation/datasets.md +++ b/documentation/datasets.md @@ -1,19 +1,72 @@ # Built-in Datasets DataGradients offer a few basic datasets which can help you load your data without needing to provide any additional code. -These datasets contain only the very basic functionalities and are not recommended for training. -## Object Detection +These datasets contain only the basic functionalities. +They are meant to be used within SuperGradients and are not recommended to be used for training (No `transform` parameter available). +## List of Datasets -### Yolo Format Detection Dataset +- [Detection Datasets](#detection-datasets) + - [1. COCODetectionDataset](#1-cocodetectiondataset) + - [2. COCOFormatDetectionDataset](#2-cocoformatdetectiondataset) + - [3. VOCDetectionDataset](#3-vocdetectiondataset) + - [4. VOCFormatDetectionDataset](#4-vocformatdetectiondataset) + - [5. YoloFormatDetectionDataset](#5-yoloformatdetectiondataset) +- [Segmentation Datasets](#segmentation-datasets) + - [1. COCOFormatSegmentationDataset](#1-cocoformatsegmentationdataset) + - [2. COCOSegmentationDataset](#2-cocosegmentationdataset) + - [3. VOCFormatSegmentationDataset](#3-vocformatsegmentationdataset) + - [4. VOCSegmentationDataset](#4-vocsegmentationdataset) -The Yolo format Detection Dataset supports any dataset stored in the YOLO format. + +## Detection Datasets + +
+ +### 1. COCODetectionDataset + +Coco Detection Dataset expects the exact same annotation files and dataset structure as the original Coco dataset. #### Expected folder structure -Any structure including at least one sub-directory for images and one for labels. They can be the same. +The dataset folder structure should match the original COCO layout, as in the example below. -Example 1: Separate directories for images and labels +Example: +``` +dataset_root/ + ├── images/ + │ ├── train2017/ + │ ├── val2017/ + │ └── ... + └── annotations/ + ├── instances_train2017.json + ├── instances_val2017.json + └── ... +``` + +#### Instantiation +To instantiate a dataset object for training data of the year 2017, use the following code: + +```python +from data_gradients.datasets.detection import COCODetectionDataset + +train_set = COCODetectionDataset(root_dir="", split="train", year=2017) +val_set = COCODetectionDataset(root_dir="", split="val", year=2017) +``` + + +*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/detection/coco_detection_dataset.py)* + +
+ +### 2. COCOFormatDetectionDataset + +The Coco Format Detection Dataset supports datasets where labels and annotations are stored in COCO format. + +#### Expected folder structure +The dataset folder structure should include at least one sub-directory for images and one JSON file for annotations. + +Example: ``` dataset_root/ ├── images/ @@ -25,33 +78,13 @@ Example 1: Separate directories for images and labels │ │ ├── ... │ └── validation/ │ ├── ... - └── labels/ - ├── train/ - │ ├── 1.txt - │ ├── 2.txt - │ └── ... - ├── test/ - │ ├── ... - └── validation/ - ├── ... -``` - -Example 2: Same directory for images and labels -``` - dataset_root/ - ├── train/ - │ ├── 1.jpg - │ ├── 1.txt - │ ├── 2.jpg - │ ├── 2.txt - │ └── ... - └── validation/ - ├── ... + └── annotations/ + ├── train.json + ├── test.json + └── validation.json ``` - -#### Expected label files structure -The label files must be structured such that each row represents a bounding box annotation. -Each bounding box is represented by 5 elements: `class_id, cx, cy, w, h`. +#### Expected Annotation File Structure +The annotation files must be structured in JSON format following the COCO data format. #### Instantiation ``` @@ -61,29 +94,124 @@ dataset_root/ │ │ ├── 1.jpg │ │ ├── 2.jpg │ │ └── ... - │ ├── test/ + │ ├── val/ │ │ ├── ... - │ └── validation/ + │ └── test/ │ ├── ... 
- └── labels/ - ├── train/ - │ ├── 1.txt - │ ├── 2.txt + └── annotations/ + ├── train.json + ├── test.json + └── validation.json +``` + +```python +from data_gradients.datasets.detection import COCOFormatDetectionDataset + +train_set = COCOFormatDetectionDataset( + root_dir="", images_subdir="images/train", annotation_file_path="annotations/train.json" +) +val_set = COCOFormatDetectionDataset( + root_dir="", images_subdir="images/validation", annotation_file_path="annotations/validation.json" +) +``` + + +*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/detection/coco_format_detection_dataset.py)* + +
+ +### 3. VOCDetectionDataset + +VOC Detection Dataset is a sub-class of the VOCFormatDetectionDataset, +where the folders are structured exactly like the original PascalVOC. + +#### Expected folder structure +Any structure including at least one sub-directory for images and one for xml labels. They can be the same. + +Example 1: Separate directories for images and labels +``` +dataset_root/ + ├── VOC2007/ + │ ├── JPEGImages/ + │ │ ├── 1.jpg + │ │ ├── 2.jpg + │ │ └── ... + │ ├── Annotations/ + │ │ ├── 1.xml + │ │ ├── 2.xml + │ │ └── ... + │ └── ImageSets/ + │ └── Main + │ ├── train.txt + │ ├── val.txt + │ ├── train_val.txt + │ └── ... + └── VOC2012/ + └── ... +``` + + +**Note**: The label files need to be stored in XML format, but the file extension can be different. + +#### Expected label files structure +The label files must be structured in XML format, like in the following example: + +``` xml +<annotation> + <object> + <name>chair</name> + <bndbox> + <xmin>1</xmin> + <ymin>213</ymin> + <xmax>263</xmax> + <ymax>375</ymax> + </bndbox> + </object> + <object> + <name>sofa</name> + <bndbox> + <xmin>104</xmin> + <ymin>151</ymin> + <xmax>334</xmax> + <ymax>287</ymax> + </bndbox> + </object> +</annotation> +``` + + +#### Instantiation +Let's take an example where we only have VOC2012 +``` +dataset_root/ + └── VOC2012/ + ├── JPEGImages/ + │ ├── 1.jpg + │ ├── 2.jpg │ └── ... - ├── test/ - │ ├── ... - └── validation/ - ├── ... + ├── Annotations/ + │ ├── 1.xml + │ ├── 2.xml + │ └── ... 
+ └── ImageSets/ + └── Main + ├── train.txt + └── val.txt ``` ```python -from data_gradients.datasets.detection import YoloFormatDetectionDataset +from data_gradients.datasets.detection import VOCDetectionDataset -train_set = YoloFormatDetectionDataset(root_dir="", images_dir="images/train", labels_dir="labels/train") -val_set = YoloFormatDetectionDataset(root_dir="", images_dir="images/validation", labels_dir="labels/validation") +train_set = VOCDetectionDataset(root_dir="", year=2012, split="train") +val_set = VOCDetectionDataset(root_dir="", year=2012, split="val") ``` -### VOC Format Detection Dataset + +*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/detection/voc_detection_dataset.py)* + +
+ +### 4. VOCFormatDetectionDataset The VOC format Detection Dataset supports datasets where labels are stored in XML following according to VOC standard. @@ -116,6 +244,8 @@ Example 1: Separate directories for images and labels Example 2: Same directory for images and labels ``` dataset_root/ + ├── train.txt + ├── validation.txt ├── train/ │ ├── 1.jpg │ ├── 1.xml @@ -154,10 +284,22 @@ The label files must be structured in XML format, like in the following example: ``` +The (optional) config file should include the list image ids to include. +``` +1 +5 +6 +... +34122 +``` +The associated images/labels will then be loaded from the images_subdir and labels_subdir. +If config_path is not provided, all images will be used. #### Instantiation ``` dataset_root/ + ├── train.txt + ├── validation.txt ├── images/ │ ├── train/ │ │ ├── 1.jpg @@ -178,127 +320,120 @@ dataset_root/ ├── ... ``` + ```python from data_gradients.datasets.detection import VOCFormatDetectionDataset -train_set = VOCFormatDetectionDataset(root_dir="", images_dir="images/train", labels_dir="labels/train") -val_set = VOCFormatDetectionDataset(root_dir="", images_dir="images/validation", labels_dir="labels/validation") +train_set = VOCFormatDetectionDataset( + root_dir="", images_subdir="images/train", labels_subdir="labels/train", config_path="train.txt" +) +val_set = VOCFormatDetectionDataset( + root_dir="", images_subdir="images/validation", labels_subdir="labels/validation", config_path="validation.txt" +) ``` -### VOC Detection Dataset -VOC Detection Dataset is a sub-class of the [VOC Format Detection Dataset](#voc_format_detection_dataset), -where the folders are structured exactly similarly to the original PascalVOC. +*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/detection/voc_format_detection_dataset.py)* + +
+ +### 5. YoloFormatDetectionDataset + +The Yolo format Detection Dataset supports any dataset stored in the YOLO format. #### Expected folder structure -Any structure including at least one sub-directory for images and one for xml labels. They can be the same. +Any structure including at least one sub-directory for images and one for labels. They can be the same. Example 1: Separate directories for images and labels ``` -dataset_root/ - ├── VOC2007/ - │ ├── JPEGImages/ - │ │ ├── 1.jpg - │ │ ├── 2.jpg - │ │ └── ... - │ ├── Annotations/ - │ │ ├── 1.xml - │ │ ├── 2.xml - │ │ └── ... - │ └── ImageSets/ - │ └── Main - │ ├── train.txt - │ ├── val.txt - │ ├── train_val.txt - │ └── ... - └── VOC2012/ - └── ... + dataset_root/ + ├── images/ + │ ├── train/ + │ │ ├── 1.jpg + │ │ ├── 2.jpg + │ │ └── ... + │ ├── test/ + │ │ ├── ... + │ └── validation/ + │ ├── ... + └── labels/ + ├── train/ + │ ├── 1.txt + │ ├── 2.txt + │ └── ... + ├── test/ + │ ├── ... + └── validation/ + ├── ... ``` - -**Note**: The label file need to be stored in XML format, but the file extension can be different. - -#### Expected label files structure -The label files must be structured in XML format, like in the following example: - -``` xml - - - chair - - 1 - 213 - 263 - 375 - - - - sofa - - 104 - 151 - 334 - 287 - - - +Example 2: Same directory for images and labels +``` + dataset_root/ + ├── train/ + │ ├── 1.jpg + │ ├── 1.txt + │ ├── 2.jpg + │ ├── 2.txt + │ └── ... + └── validation/ + ├── ... ``` +#### Expected label files structure +The label files must be structured such that each row represents a bounding box label. +Each bounding box is represented by 5 elements: `class_id, cx, cy, w, h`. #### Instantiation -Let's take an example where we only have VOC2012 ``` dataset_root/ - └── VOC2012/ - ├── JPEGImages/ - │ ├── 1.jpg - │ ├── 2.jpg - │ └── ... - ├── Annotations/ - │ ├── 1.xml - │ ├── 2.xml + ├── images/ + │ ├── train/ + │ │ ├── 1.jpg + │ │ ├── 2.jpg + │ │ └── ... + │ ├── test/ + │ │ ├── ... 
+ │ └── validation/ + │ ├── ... + └── labels/ + ├── train/ + │ ├── 1.txt + │ ├── 2.txt │ └── ... - └── ImageSets/ - └── Main - ├── train.txt - └── val.txt + ├── test/ + │ ├── ... + └── validation/ + ├── ... ``` ```python -from data_gradients.datasets.detection import VOCDetectionDataset +from data_gradients.datasets.detection import YoloFormatDetectionDataset -train_set = VOCDetectionDataset(root_dir="", year=2012, image_set="train") -val_set = VOCDetectionDataset(root_dir="", year=2012, image_set="val") +train_loader = YoloFormatDetectionDataset(root_dir="", images_dir="images/train", labels_dir="labels/train") +val_loader = YoloFormatDetectionDataset(root_dir="", images_dir="images/validation", labels_dir="labels/validation") ``` +This class does NOT support dataset formats such as Pascal VOC or COCO. + + +*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/detection/yolo_format_detection_dataset.py)* + +
-### Coco Format Detection Dataset -The Coco Format Detection Dataset supports datasets where labels and annotations are stored in COCO format. + +## Segmentation Datasets + +
+ +### 1. COCOFormatSegmentationDataset + +The Coco Format Segmentation Dataset supports datasets where labels and masks are stored in COCO format. #### Expected folder structure The dataset folder structure should include at least one sub-directory for images and one JSON file for annotations. Example: -``` - dataset_root/ - ├── images/ - │ ├── train/ - │ │ ├── 1.jpg - │ │ ├── 2.jpg - │ │ └── ... - │ ├── test/ - │ │ ├── ... - │ └── validation/ - │ ├── ... - └── annotations/ - ├── train.json - ├── test.json - └── validation.json -``` -#### Expected Annotation File Structure -The annotation files must be structured in JSON format following the COCO data format. - -#### Instantiation ``` dataset_root/ ├── images/ @@ -306,53 +441,201 @@ dataset_root/ │ │ ├── 1.jpg │ │ ├── 2.jpg │ │ └── ... - │ ├── val/ + │ ├── test/ │ │ ├── ... - │ └── test/ + │ └── validation/ │ ├── ... └── annotations/ ├── train.json ├── test.json └── validation.json ``` +#### Expected Annotation File Structure +The annotation files must be structured in JSON format following the COCO data format, including mask data. +#### Instantiation ```python -from data_gradients.datasets.detection import COCOFormatDetectionDataset - -train_set = COCOFormatDetectionDataset( - root_dir="", images_subdir="images/train", annotation_file_path="annotations/train.json" +from data_gradients.datasets.segmentation import COCOFormatSegmentationDataset +train_set = COCOFormatSegmentationDataset( + root_dir="", + images_subdir="images/train", + annotation_file_path="annotations/train.json" ) -val_set = COCOFormatDetectionDataset( - root_dir="", images_subdir="images/validation", annotation_file_path="annotations/validation.json" +val_set = COCOFormatSegmentationDataset( + root_dir="", + images_subdir="images/validation", + annotation_file_path="annotations/validation.json" ) ``` -### Coco Detection Dataset -Coco Detection Dataset expects the exact same annotation files and dataset structure os the original Coco dataset. 
+*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/segmentation/coco_format_segmentation_dataset.py)* + +
+ +### 2. COCOSegmentationDataset + +The COCOSegmentationDataset class is a convenience subclass of the COCOFormatSegmentationDataset that simplifies +the instantiation for the widely-used COCO Segmentation Dataset. + +This class assumes the default COCO dataset structure and naming conventions. The data should be stored in a specific +structure where each split of data (train, val) and year of the dataset is kept in a different directory. #### Expected folder structure -The dataset folder structure should -Example: ``` dataset_root/ ├── images/ │ ├── train2017/ - │ ├── val2017/ - │ └── ... + │ │ ├── 1.jpg + │ │ ├── 2.jpg + │ │ └── ... + │ └── val2017/ + │ ├── 15481.jpg + │ ├── 15482.jpg + │ └── ... └── annotations/ ├── instances_train2017.json - ├── instances_val2017.json - └── ... + └── instances_val2017.json ``` #### Instantiation -To instantiate a dataset object for training data of the year 2017, use the following code: ```python -from data_gradients.datasets.detection import COCODetectionDataset +from data_gradients.datasets.segmentation import COCOSegmentationDataset +train_set = COCOSegmentationDataset(root_dir="", split="train", year=2017) +val_set = COCOSegmentationDataset(root_dir="", split="val", year=2017) +``` -train_set = COCODetectionDataset(root_dir="", split="train", year=2017) -val_set = COCODetectionDataset(root_dir="", split="val", year=2017) + +*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/segmentation/coco_segmentation_dataset.py)* + +
+ +### 3. VOCFormatSegmentationDataset + +The VOC format Segmentation Dataset supports datasets where labels are stored as images, with each color in the image representing a different class. + +#### Expected folder structure +Similar to the VOCFormatDetectionDataset, this class also expects certain folder structures. For example: + +Example: Separate directories for images and labels +``` + dataset_root/ + ├── train.txt + ├── validation.txt + ├── images/ + │ ├── train/ + │ │ ├── 1.jpg + │ │ ├── 2.jpg + │ │ └── ... + │ ├── test/ + │ │ ├── ... + │ └── validation/ + │ ├── ... + └── labels/ + ├── train/ + │ ├── 1.png + │ ├── 2.png + │ └── ... + ├── test/ + │ ├── ... + └── validation/ + ├── ... +``` +Each label image should be a color image where the color of each pixel corresponds to the class of that pixel. + +The (optional) config file should include the list of image ids to include. +``` +1 +5 +6 +# And so on ... +``` +The associated images/labels will then be loaded from the images_subdir and labels_subdir. +If config_path is not provided, all images will be used. + +#### Instantiation +```python +from data_gradients.datasets.segmentation import VOCFormatSegmentationDataset + +color_map = [ + [0, 0, 0], # class 0 + [255, 0, 0], # class 1 + [0, 255, 0], # class 2 + # ... +] + +train_set = VOCFormatSegmentationDataset( + root_dir="", + images_subdir="images/train", + labels_subdir="labels/train", + class_names=["background", "class1", "class2"], + color_map=color_map, + config_path="train.txt" +) +val_set = VOCFormatSegmentationDataset( + root_dir="", + images_subdir="images/validation", + labels_subdir="labels/validation", + class_names=["background", "class1", "class2"], + color_map=color_map, + config_path="validation.txt" +) +``` + + +*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/segmentation/voc_format_segmentation_dataset.py)* + +
+ +### 4. VOCSegmentationDataset + + +The VOCSegmentationDataset is specifically tailored for loading PASCAL VOC segmentation datasets. + +#### Expected folder structure +Similar to the VOCFormatSegmentationDataset, this class also expects certain folder structures. +The folder structure of the PASCAL VOC dataset is as follows: + +``` + dataset_root/ + ├── VOC2007/ + │ ├── JPEGImages/ + │ ├── SegmentationClass/ + │ └── ImageSets/ + │ └── Segmentation/ + │ ├── train.txt + │ └── val.txt + └── VOC2012/ + ├── JPEGImages/ + ├── SegmentationClass/ + └── ImageSets/ + └── Segmentation/ + ├── train.txt + └── val.txt ``` +Each label image should be a color image where the color of each pixel corresponds to the class of that pixel. + +#### Instantiation +``` +from data_gradients.datasets.segmentation import VOCSegmentationDataset + +train_set = VOCSegmentationDataset( + root_dir="", + year=2007, + split="train", + verbose=True +) +val_set = VOCSegmentationDataset( + root_dir="", + year=2007, + split="val", + verbose=True +) +``` + + +*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/segmentation/voc_segmentation_dataset.py)* + +
"""Regenerate ``documentation/datasets.md`` from the built-in dataset docstrings.

This is the content of ``scripts/describe_datasets.py`` (a new file in this patch).
Run it as a script: ``python scripts/describe_datasets.py``.
"""
import inspect
import os
import re
from types import ModuleType
from typing import Iterable, List, Tuple

GITHUB_BASE_URL = "https://github.com/Deci-AI/data-gradients/blob/master/src/"

# NOTE(review): in this copy of the patch the separator between sections was
# stripped (the string literal is split across two physical lines). "<br/>" is
# the presumed original tag -- confirm against the repository before merging.
SECTION_BREAK = "<br/>"

SUMMARY_TEMPLATE = """# Built-in Datasets

DataGradients offer a few basic datasets which can help you load your data without needing to provide any additional code.

These datasets contain only the basic functionalities.
They are meant to be used within SuperGradients and are not recommended to be used for training (No `transform` parameter available).

{table_of_contents}

{dataset_descriptions}"""


def remove_first_indentation(s: str) -> str:
    """Strip one indentation level (4 leading spaces) from every line of ``s``.

    Lines that start with fewer than 4 spaces are left untouched; deeper
    indentation keeps whatever remains after the first 4 spaces.
    """
    # `{4}` is spelled out so the width is explicit and cannot be silently
    # altered by editors or tools that collapse runs of whitespace.
    return re.sub(r"^ {4}", "", s, flags=re.MULTILINE)


def section_name_to_md_link(name: str) -> str:
    """Convert a section name to a markdown link pointing at its anchor.

    The anchor is the lower-cased name with spaces replaced by dashes and
    dots removed, e.g. ``"1. Foo"`` -> ``"[1. Foo](#1-foo)"``.
    """
    link = name.lower().replace(" ", "-").replace(".", "")
    return f"[{name}](#{link})"


def class_to_github_url(class_obj: type) -> str:
    """Return the GitHub URL of the source file that defines ``class_obj``.

    The URL is derived from the dotted module name of the class, so the class
    does not need to be importable from ``sys.modules`` for this to work.
    """
    # `__module__` already holds the dotted module name; going through
    # `inspect.getmodule()` would return None (and crash on `.__name__`)
    # whenever the module cannot be resolved.
    module_path = class_obj.__module__.replace(".", "/") + ".py"
    return GITHUB_BASE_URL + module_path


def build_summary(categories: Iterable[Tuple[str, ModuleType]]) -> str:
    """Build the full markdown document describing every dataset class.

    :param categories: Pairs of ``(section title, module)``. Every class exposed
                       by ``module`` gets its own numbered sub-section, in the
                       (name-sorted) order returned by ``inspect.getmembers``.
    :return:           The markdown text: header, table of contents, then one
                       section per category.
    """
    # Parts are collected in lists and joined once instead of growing strings with `+=`.
    toc_parts: List[str] = ["## List of Datasets\n\n"]
    description_parts: List[str] = []

    for category, module in categories:
        toc_parts.append(f"- {section_name_to_md_link(category)}\n")
        description_parts.append(f"## {category}\n\n{SECTION_BREAK}\n\n")

        dataset_classes = inspect.getmembers(module, inspect.isclass)
        for index, (class_name, class_obj) in enumerate(dataset_classes, start=1):
            # Class docstrings are indented one level relative to the class body.
            dataset_doc = remove_first_indentation(class_obj.__doc__ or "No description provided.")

            dataset_title = f"{index}. {class_name}"
            toc_parts.append(f"    - {section_name_to_md_link(dataset_title)}\n")

            description_parts.append(f"### {dataset_title}\n\n")
            description_parts.append(f"{dataset_doc}\n\n")
            description_parts.append(f"*[source code]({class_to_github_url(class_obj)})*\n\n{SECTION_BREAK}\n\n")

        # Empty line between categories
        description_parts.append("\n")

    return SUMMARY_TEMPLATE.format(
        table_of_contents="".join(toc_parts),
        dataset_descriptions="".join(description_parts),
    )


def main() -> None:
    """Generate the dataset documentation and write it to ``documentation/datasets.md``."""
    # Imported here (not at module level) so that the pure helpers above can be
    # imported and tested without the `data_gradients` package being installed.
    from data_gradients.datasets import detection as detection_datasets
    from data_gradients.datasets import segmentation as segmentation_datasets

    summary = build_summary(
        [
            ("Detection Datasets", detection_datasets),
            ("Segmentation Datasets", segmentation_datasets),
        ]
    )

    # `abspath` keeps the path correct when `__file__` is relative
    # (e.g. the script is launched from inside `scripts/`).
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    # The docstrings contain non-ASCII tree-drawing characters, so the encoding
    # must not depend on the platform's locale default.
    with open(os.path.join(root_dir, "documentation", "datasets.md"), "w", encoding="utf-8") as f:
        f.write(summary)


if __name__ == "__main__":
    main()