From 1db48314ed748bb783a1bc58c9a13c07f0ffbfad Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 28 Aug 2023 17:53:18 +0300 Subject: [PATCH 1/5] add script and run --- documentation/datasets.md | 595 ++++++++++++++++++++++++++--------- scripts/describe_datasets.py | 74 +++++ 2 files changed, 513 insertions(+), 156 deletions(-) create mode 100644 scripts/describe_datasets.py diff --git a/documentation/datasets.md b/documentation/datasets.md index fb676a48..22d2b623 100644 --- a/documentation/datasets.md +++ b/documentation/datasets.md @@ -1,19 +1,72 @@ -# Built-in Datasets +# Datasets Description DataGradients offer a few basic datasets which can help you load your data without needing to provide any additional code. -These datasets contain only the very basic functionalities and are not recommended for training. -## Object Detection +These datasets contain only the basic functionalities. +They are meant to be used within SuperGradients and are not recommended to be used for training (No `transform` parameter available). +## List of Datasets -### Yolo Format Detection Dataset +- [Detection Datasets](#detection-datasets) + - [1. COCODetectionDataset](#1-cocodetectiondataset) + - [2. COCOFormatDetectionDataset](#2-cocoformatdetectiondataset) + - [3. VOCDetectionDataset](#3-vocdetectiondataset) + - [4. VOCFormatDetectionDataset](#4-vocformatdetectiondataset) + - [5. YoloFormatDetectionDataset](#5-yoloformatdetectiondataset) +- [Segmentation Datasets](#segmentation-datasets) + - [1. COCOFormatSegmentationDataset](#1-cocoformatsegmentationdataset) + - [2. COCOSegmentationDataset](#2-cocosegmentationdataset) + - [3. VOCFormatSegmentationDataset](#3-vocformatsegmentationdataset) + - [4. VOCSegmentationDataset](#4-vocsegmentationdataset) -The Yolo format Detection Dataset supports any dataset stored in the YOLO format. + +## Detection Datasets + +
+ +### 1. COCODetectionDataset + +Coco Detection Dataset expects the exact same annotation files and dataset structure os the original Coco dataset. #### Expected folder structure -Any structure including at least one sub-directory for images and one for labels. They can be the same. +The dataset folder structure should -Example 1: Separate directories for images and labels +Example: +``` +dataset_root/ + ├── images/ + │ ├── train2017/ + │ ├── val2017/ + │ └── ... + └── annotations/ + ├── instances_train2017.json + ├── instances_val2017.json + └── ... +``` + +#### Instantiation +To instantiate a dataset object for training data of the year 2017, use the following code: + +```python +from data_gradients.datasets.detection import COCODetectionDataset + +train_set = COCODetectionDataset(root_dir="", split="train", year=2017) +val_set = COCODetectionDataset(root_dir="", split="val", year=2017) +``` + + +*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/detection/coco_detection_dataset.py)* + +
+ +### 2. COCOFormatDetectionDataset + +The Coco Format Detection Dataset supports datasets where labels and annotations are stored in COCO format. + +#### Expected folder structure +The dataset folder structure should include at least one sub-directory for images and one JSON file for annotations. + +Example: ``` dataset_root/ ├── images/ @@ -25,33 +78,13 @@ Example 1: Separate directories for images and labels │ │ ├── ... │ └── validation/ │ ├── ... - └── labels/ - ├── train/ - │ ├── 1.txt - │ ├── 2.txt - │ └── ... - ├── test/ - │ ├── ... - └── validation/ - ├── ... -``` - -Example 2: Same directory for images and labels -``` - dataset_root/ - ├── train/ - │ ├── 1.jpg - │ ├── 1.txt - │ ├── 2.jpg - │ ├── 2.txt - │ └── ... - └── validation/ - ├── ... + └── annotations/ + ├── train.json + ├── test.json + └── validation.json ``` - -#### Expected label files structure -The label files must be structured such that each row represents a bounding box annotation. -Each bounding box is represented by 5 elements: `class_id, cx, cy, w, h`. +#### Expected Annotation File Structure +The annotation files must be structured in JSON format following the COCO data format. #### Instantiation ``` @@ -61,29 +94,124 @@ dataset_root/ │ │ ├── 1.jpg │ │ ├── 2.jpg │ │ └── ... - │ ├── test/ + │ ├── val/ │ │ ├── ... - │ └── validation/ + │ └── test/ │ ├── ... 
- └── labels/ - ├── train/ - │ ├── 1.txt - │ ├── 2.txt + └── annotations/ + ├── train.json + ├── test.json + └── validation.json +``` + +```python +from data_gradients.datasets.detection import COCOFormatDetectionDataset + +train_set = COCOFormatDetectionDataset( + root_dir="", images_subdir="images/train", annotation_file_path="annotations/train.json" +) +val_set = COCOFormatDetectionDataset( + root_dir="", images_subdir="images/validation", annotation_file_path="annotations/validation.json" +) +``` + + +*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/detection/coco_format_detection_dataset.py)* + +
+ +### 3. VOCDetectionDataset + +VOC Detection Dataset is a sub-class of the VOCFormatDetectionDataset, +but where the folders are structured exactly similarly to the original PascalVOC. + +#### Expected folder structure +Any structure including at least one sub-directory for images and one for xml labels. They can be the same. + +Example 1: Separate directories for images and labels +``` +dataset_root/ + ├── VOC2007/ + │ ├── JPEGImages/ + │ │ ├── 1.jpg + │ │ ├── 2.jpg + │ │ └── ... + │ ├── Annotations/ + │ │ ├── 1.xml + │ │ ├── 2.xml + │ │ └── ... + │ └── ImageSets/ + │ └── Main + │ ├── train.txt + │ ├── val.txt + │ ├── train_val.txt + │ └── ... + └── VOC2012/ + └── ... +``` + + +**Note**: The label file need to be stored in XML format, but the file extension can be different. + +#### Expected label files structure +The label files must be structured in XML format, like in the following example: + +``` xml + + + chair + + 1 + 213 + 263 + 375 + + + + sofa + + 104 + 151 + 334 + 287 + + + +``` + + +#### Instantiation +Let's take an example where we only have VOC2012 +``` +dataset_root/ + └── VOC2012/ + ├── JPEGImages/ + │ ├── 1.jpg + │ ├── 2.jpg │ └── ... - ├── test/ - │ ├── ... - └── validation/ - ├── ... + ├── Annotations/ + │ ├── 1.xml + │ ├── 2.xml + │ └── ... 
+ └── ImageSets/ + └── Main + ├── train.txt + └── val.txt ``` ```python -from data_gradients.datasets.detection import YoloFormatDetectionDataset +from data_gradients.datasets.detection import VOCDetectionDataset -train_set = YoloFormatDetectionDataset(root_dir="", images_dir="images/train", labels_dir="labels/train") -val_set = YoloFormatDetectionDataset(root_dir="", images_dir="images/validation", labels_dir="labels/validation") +train_set = VOCDetectionDataset(root_dir="", year=2012, split="train") +val_set = VOCDetectionDataset(root_dir="", year=2012, split="val") ``` -### VOC Format Detection Dataset + +*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/detection/voc_detection_dataset.py)* + +
+ +### 4. VOCFormatDetectionDataset The VOC format Detection Dataset supports datasets where labels are stored in XML following according to VOC standard. @@ -116,6 +244,8 @@ Example 1: Separate directories for images and labels Example 2: Same directory for images and labels ``` dataset_root/ + ├── train.txt + ├── validation.txt ├── train/ │ ├── 1.jpg │ ├── 1.xml @@ -154,10 +284,22 @@ The label files must be structured in XML format, like in the following example: ``` +The (optional) config file should include the list image ids to include. +``` +1 +5 +6 +... +34122 +``` +The associated images/labels will then be loaded from the images_subdir and labels_subdir. +If config_path is not provided, all images will be used. #### Instantiation ``` dataset_root/ + ├── train.txt + ├── validation.txt ├── images/ │ ├── train/ │ │ ├── 1.jpg @@ -178,127 +320,120 @@ dataset_root/ ├── ... ``` + ```python from data_gradients.datasets.detection import VOCFormatDetectionDataset -train_set = VOCFormatDetectionDataset(root_dir="", images_dir="images/train", labels_dir="labels/train") -val_set = VOCFormatDetectionDataset(root_dir="", images_dir="images/validation", labels_dir="labels/validation") +train_set = VOCFormatDetectionDataset( + root_dir="", images_subdir="images/train", labels_subdir="labels/train", config_path="train.txt" +) +val_set = VOCFormatDetectionDataset( + root_dir="", images_subdir="images/validation", labels_subdir="labels/validation", config_path="validation.txt" +) ``` -### VOC Detection Dataset -VOC Detection Dataset is a sub-class of the [VOC Format Detection Dataset](#voc_format_detection_dataset), -where the folders are structured exactly similarly to the original PascalVOC. +*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/detection/voc_format_detection_dataset.py)* + +
+ +### 5. YoloFormatDetectionDataset + +The Yolo format Detection Dataset supports any dataset stored in the YOLO format. #### Expected folder structure -Any structure including at least one sub-directory for images and one for xml labels. They can be the same. +Any structure including at least one sub-directory for images and one for labels. They can be the same. Example 1: Separate directories for images and labels ``` -dataset_root/ - ├── VOC2007/ - │ ├── JPEGImages/ - │ │ ├── 1.jpg - │ │ ├── 2.jpg - │ │ └── ... - │ ├── Annotations/ - │ │ ├── 1.xml - │ │ ├── 2.xml - │ │ └── ... - │ └── ImageSets/ - │ └── Main - │ ├── train.txt - │ ├── val.txt - │ ├── train_val.txt - │ └── ... - └── VOC2012/ - └── ... + dataset_root/ + ├── images/ + │ ├── train/ + │ │ ├── 1.jpg + │ │ ├── 2.jpg + │ │ └── ... + │ ├── test/ + │ │ ├── ... + │ └── validation/ + │ ├── ... + └── labels/ + ├── train/ + │ ├── 1.txt + │ ├── 2.txt + │ └── ... + ├── test/ + │ ├── ... + └── validation/ + ├── ... ``` - -**Note**: The label file need to be stored in XML format, but the file extension can be different. - -#### Expected label files structure -The label files must be structured in XML format, like in the following example: - -``` xml - - - chair - - 1 - 213 - 263 - 375 - - - - sofa - - 104 - 151 - 334 - 287 - - - +Example 2: Same directory for images and labels +``` + dataset_root/ + ├── train/ + │ ├── 1.jpg + │ ├── 1.txt + │ ├── 2.jpg + │ ├── 2.txt + │ └── ... + └── validation/ + ├── ... ``` +#### Expected label files structure +The label files must be structured such that each row represents a bounding box label. +Each bounding box is represented by 5 elements: `class_id, cx, cy, w, h`. #### Instantiation -Let's take an example where we only have VOC2012 ``` dataset_root/ - └── VOC2012/ - ├── JPEGImages/ - │ ├── 1.jpg - │ ├── 2.jpg - │ └── ... - ├── Annotations/ - │ ├── 1.xml - │ ├── 2.xml + ├── images/ + │ ├── train/ + │ │ ├── 1.jpg + │ │ ├── 2.jpg + │ │ └── ... + │ ├── test/ + │ │ ├── ... 
+ │ └── validation/ + │ ├── ... + └── labels/ + ├── train/ + │ ├── 1.txt + │ ├── 2.txt │ └── ... - └── ImageSets/ - └── Main - ├── train.txt - └── val.txt + ├── test/ + │ ├── ... + └── validation/ + ├── ... ``` ```python -from data_gradients.datasets.detection import VOCDetectionDataset +from data_gradients.datasets.detection import YoloFormatDetectionDataset -train_set = VOCDetectionDataset(root_dir="", year=2012, image_set="train") -val_set = VOCDetectionDataset(root_dir="", year=2012, image_set="val") +train_loader = YoloFormatDetectionDataset(root_dir="", images_dir="images/train", labels_dir="labels/train") +val_loader = YoloFormatDetectionDataset(root_dir="", images_dir="images/validation", labels_dir="labels/validation") ``` +This class does NOT support dataset formats such as Pascal VOC or COCO. + + +*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/detection/yolo_format_detection_dataset.py)* + +
-### Coco Format Detection Dataset -The Coco Format Detection Dataset supports datasets where labels and annotations are stored in COCO format. + +## Segmentation Datasets + +
+ +### 1. COCOFormatSegmentationDataset + +The Coco Format Segmentation Dataset supports datasets where labels and masks are stored in COCO format. #### Expected folder structure The dataset folder structure should include at least one sub-directory for images and one JSON file for annotations. Example: -``` - dataset_root/ - ├── images/ - │ ├── train/ - │ │ ├── 1.jpg - │ │ ├── 2.jpg - │ │ └── ... - │ ├── test/ - │ │ ├── ... - │ └── validation/ - │ ├── ... - └── annotations/ - ├── train.json - ├── test.json - └── validation.json -``` -#### Expected Annotation File Structure -The annotation files must be structured in JSON format following the COCO data format. - -#### Instantiation ``` dataset_root/ ├── images/ @@ -306,53 +441,201 @@ dataset_root/ │ │ ├── 1.jpg │ │ ├── 2.jpg │ │ └── ... - │ ├── val/ + │ ├── test/ │ │ ├── ... - │ └── test/ + │ └── validation/ │ ├── ... └── annotations/ ├── train.json ├── test.json └── validation.json ``` +#### Expected Annotation File Structure +The annotation files must be structured in JSON format following the COCO data format, including mask data. +#### Instantiation ```python -from data_gradients.datasets.detection import COCOFormatDetectionDataset - -train_set = COCOFormatDetectionDataset( - root_dir="", images_subdir="images/train", annotation_file_path="annotations/train.json" +from data_gradients.datasets.segmentation import COCOFormatSegmentationDataset +train_set = COCOFormatSegmentationDataset( + root_dir="", + images_subdir="images/train", + annotation_file_path="annotations/train.json" ) -val_set = COCOFormatDetectionDataset( - root_dir="", images_subdir="images/validation", annotation_file_path="annotations/validation.json" +val_set = COCOFormatSegmentationDataset( + root_dir="", + images_subdir="images/validation", + annotation_file_path="annotations/validation.json" ) ``` -### Coco Detection Dataset -Coco Detection Dataset expects the exact same annotation files and dataset structure os the original Coco dataset. 
+*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/segmentation/coco_format_segmentation_dataset.py)* + +
+ +### 2. COCOSegmentationDataset + +The COCOSegmentationDataset class is a convenience subclass of the COCOFormatSegmentationDataset that simplifies +the instantiation for the widely-used COCO Segmentation Dataset. + +This class assumes the default COCO dataset structure and naming conventions. The data should be stored in a specific +structure where each split of data (train, val) and year of the dataset is kept in a different directory. #### Expected folder structure -The dataset folder structure should -Example: ``` dataset_root/ ├── images/ │ ├── train2017/ - │ ├── val2017/ - │ └── ... + │ │ ├── 1.jpg + │ │ ├── 2.jpg + │ │ └── ... + │ └── val2017/ + │ ├── 15481.jpg + │ ├── 15482.jpg + │ └── ... └── annotations/ ├── instances_train2017.json - ├── instances_val2017.json - └── ... + └── instances_val2017.json ``` #### Instantiation -To instantiate a dataset object for training data of the year 2017, use the following code: ```python -from data_gradients.datasets.detection import COCODetectionDataset +from data_gradients.datasets.segmentation import COCOSegmentationDataset +train_set = COCOSegmentationDataset(root_dir="", split="train", year=2017) +val_set = COCOSegmentationDataset(root_dir="", split="val", year=2017) +``` -train_set = COCODetectionDataset(root_dir="", split="train", year=2017) -val_set = COCODetectionDataset(root_dir="", split="val", year=2017) + +*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/segmentation/coco_segmentation_dataset.py)* + +
+ +### 3. VOCFormatSegmentationDataset + +The VOC format Segmentation Dataset supports datasets where labels are stored as images, with each color in the image representing a different class. + +#### Expected folder structure +Similar to the VOCFormatDetectionDataset, this class also expects certain folder structures. For example: + +Example: Separate directories for images and labels +``` + dataset_root/ + ├── train.txt + ├── validation.txt + ├── images/ + │ ├── train/ + │ │ ├── 1.jpg + │ │ ├── 2.jpg + │ │ └── ... + │ ├── test/ + │ │ ├── ... + │ └── validation/ + │ ├── ... + └── labels/ + ├── train/ + │ ├── 1.png + │ ├── 2.png + │ └── ... + ├── test/ + │ ├── ... + └── validation/ + ├── ... +``` +Each label image should be a color image where the color of each pixel corresponds to the class of that pixel. + +The (optional) config file should include the list image ids to include. +``` +1 +5 +6 +# And so on ... +``` +The associated images/labels will then be loaded from the images_subdir and labels_subdir. +If config_path is not provided, all images will be used. + +#### Instantiation +``` +from data_gradients.datasets.segmentation import VOCFormatSegmentationDataset + +color_map = [ + [0, 0, 0], # class 0 + [255, 0, 0], # class 1 + [0, 255, 0], # class 2 + # ... +] + +train_set = VOCFormatSegmentationDataset( + root_dir="", + images_subdir="images/train", + labels_subdir="labels/train", + class_names=["background", "class1", "class2"], + color_map=color_map, + config_path="train.txt" +) +val_set = VOCFormatSegmentationDataset( + root_dir="", + images_subdir="images/validation", + labels_subdir="labels/validation", + class_names=["background", "class1", "class2"], + color_map=color_map, + config_path="validation.txt" +) +``` + + +*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/segmentation/voc_format_segmentation_dataset.py)* + +
+ +### 4. VOCSegmentationDataset + + +The VOCSegmentationDataset is specifically tailored for loading PASCAL VOC segmentation datasets. + +#### Expected folder structure +Similar to the VOCFormatSegmentationDataset, this class also expects certain folder structures. +The folder structure of the PASCAL VOC dataset is as follows: + +``` + dataset_root/ + ├── VOC2007/ + │ ├── JPEGImages/ + │ ├── SegmentationClass/ + │ └── ImageSets/ + │ └── Segmentation/ + │ ├── train.txt + │ └── val.txt + └── VOC2012/ + ├── JPEGImages/ + ├── SegmentationClass/ + └── ImageSets/ + └── Segmentation/ + ├── train.txt + └── val.txt ``` +Each label image should be a color image where the color of each pixel corresponds to the class of that pixel. + +#### Instantiation +``` +from data_gradients.datasets.segmentation import VOCSegmentationDataset + +train_set = VOCSegmentationDataset( + root_dir="", + year=2007, + split="train", + verbose=True +) +val_set = VOCSegmentationDataset( + root_dir="", + year=2007, + split="val", + verbose=True +) +``` + + +*[source code](https://github.com/Deci-AI/data-gradients/blob/master/src/data_gradients/datasets/segmentation/voc_segmentation_dataset.py)* + +
diff --git a/scripts/describe_datasets.py b/scripts/describe_datasets.py new file mode 100644 index 00000000..9282e41f --- /dev/null +++ b/scripts/describe_datasets.py @@ -0,0 +1,74 @@ +import inspect +from data_gradients.datasets import detection as detection_datasets +from data_gradients.datasets import segmentation as segmentation_datasets +import re + + +def remove_first_indentation(s: str) -> str: + # This regular expression matches lines that start with 4 spaces + # and replaces them with nothing, thereby removing the first indentation. + return re.sub(r"^ ", "", s, flags=re.MULTILINE) + + +def section_name_to_md_link(name: str) -> str: + """Convert a section name to markdown link.""" + link = name.lower().replace(" ", "-").replace(".", "") + return f"[{name}](#{link})" + + +def class_to_github_url(class_obj: type) -> str: + github_base_url = "https://github.com/Deci-AI/data-gradients/blob/master/src/" + class_path = inspect.getmodule(class_obj).__name__ + module_path = class_path.replace(".", "/") + ".py" + return github_base_url + module_path + + +# Define the categories of datasets +categories = ["Detection Datasets", "Segmentation Datasets"] +modules = [detection_datasets, segmentation_datasets] + +# Placeholder for the markdown content +dataset_descriptions = "" +table_of_contents = "## List of Datasets\n\n" + +# Iterate over categories and corresponding modules +for category, module in zip(categories, modules): + # Add category to table of contents + table_of_contents += f"- {section_name_to_md_link(category)}\n" + + # Add category title + dataset_descriptions += f"## {category}\n\n
\n\n" + + # Get classes from module + dataset_classes = inspect.getmembers(module, inspect.isclass) + for i, (class_name, class_obj) in enumerate(dataset_classes): + dataset_doc = class_obj.__doc__ if class_obj.__doc__ else "No description provided." + # dataset_doc = '\n'.join([m.lstrip() for m in dataset_doc.split('\n')]) + dataset_doc = remove_first_indentation(dataset_doc) + + # Create dataset title and add to table of contents + dataset_title = f"{i+1}. {class_name}" + table_of_contents += f" - {section_name_to_md_link(dataset_title)}\n" + + # Append dataset details to the markdown content + dataset_descriptions += f"### {dataset_title}\n\n" + dataset_descriptions += f"{dataset_doc}\n\n" + dataset_descriptions += f"*[source code]({class_to_github_url(class_obj)})*\n\n
\n\n" + + # Add empty line between categories + dataset_descriptions += "\n" + +# Combine table of contents and dataset descriptions +summary = f"""# Datasets Description + +DataGradients offer a few basic datasets which can help you load your data without needing to provide any additional code. + +These datasets contain only the basic functionalities. +They are meant to be used within SuperGradients and are not recommended to be used for training (No `transform` parameter available). + +{table_of_contents} + +{dataset_descriptions}""" + +with open("../documentation/datasets.md", "w") as f: + f.write(summary) From bef858457561a9c5ad2f30401c22018a0fef61f7 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 28 Aug 2023 17:55:34 +0300 Subject: [PATCH 2/5] fix --- documentation/datasets.md | 2 +- scripts/describe_datasets.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/documentation/datasets.md b/documentation/datasets.md index 22d2b623..bf8022e2 100644 --- a/documentation/datasets.md +++ b/documentation/datasets.md @@ -1,4 +1,4 @@ -# Datasets Description +# Built-in Datasets DataGradients offer a few basic datasets which can help you load your data without needing to provide any additional code. diff --git a/scripts/describe_datasets.py b/scripts/describe_datasets.py index 9282e41f..2f5abe6e 100644 --- a/scripts/describe_datasets.py +++ b/scripts/describe_datasets.py @@ -59,7 +59,7 @@ def class_to_github_url(class_obj: type) -> str: dataset_descriptions += "\n" # Combine table of contents and dataset descriptions -summary = f"""# Datasets Description +summary = f"""# Built-in Datasets DataGradients offer a few basic datasets which can help you load your data without needing to provide any additional code. 
From 0e1202cc4ed718962c4cfdbf505ba03bc50b7bc6 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 28 Aug 2023 17:58:14 +0300 Subject: [PATCH 3/5] squash all changes --- README.md | 144 ++++++++-------------------- documentation/dataset_extractors.md | 114 ++++++++++++++++++++++ 2 files changed, 155 insertions(+), 103 deletions(-) create mode 100644 documentation/dataset_extractors.md diff --git a/README.md b/README.md index 6e92638e..2652d707 100644 --- a/README.md +++ b/README.md @@ -33,10 +33,7 @@ and calibrate metrics to monitor your unique dataset. - [Dataset Analysis](#dataset-analysis) - [Report](#report) - [Feature Configuration](#feature-configuration) -- [Dataset Adapters](#dataset-adapters) - - [Image Adapter](#image-adapter) - - [Label Adapter](#label-adapter) - - [Example](#example) +- [Dataset Extractors](#dataset-extractors) - [Pre-computed Dataset Analysis](#pre-computed-dataset-analysis) - [License](#license) @@ -88,10 +85,12 @@ pip install data-gradients ### Prerequisites - **Dataset**: Includes a **Train** set and a **Validation** or a **Test** set. -- **Class Names**: A list of the unique categories present in your dataset. -- **Iterable**: A method to iterate over your Dataset providing images and labels. Can be any of the following: - - PyTorch Dataloader - - PyTorch Dataset +- One of + - **Class Names**: A list of the unique categories present in your dataset. + - **Number of classes**: How many unique classes appear in your dataset (make sure that this number is greater than the highest class index) +- **Dataset Iterable**: A method to iterate over your Dataset providing images and labels. 
Can be any of the following: + - PyTorch **Dataloader** + - PyTorch **Dataset** - Generator that yields image/label pairs - Any other iterable you use for model training/validation @@ -101,8 +100,8 @@ Please ensure all the points above are checked before you proceed with **DataGra - If something cannot be automatically determined, you will be asked to provide some extra information through a text input. - In some extreme cases, the process will crash and invite you to implement a custom dataset adapter (see relevant section) -**Heads up**: We currently don't provide out-of-the-box dataset/dataloader implementation. -You can find multiple dataset implementations in [PyTorch](https://pytorch.org/vision/stable/datasets.html) +**Heads up**: We currently provide a few out-of-the-box [dataset/dataloader](./documentation/datasets.md) implementation. +You can find more dataset implementations in [PyTorch](https://pytorch.org/vision/stable/datasets.html) or [SuperGradients](https://docs.deci.ai/super-gradients/src/super_gradients/training/datasets/Dataset_Setup_Instructions.html). **Example** @@ -118,13 +117,31 @@ class_names = ["person", "bicycle", "car", "motorcycle", ...] ### Dataset Analysis You are now ready to go, chose the relevant analyzer for your task and run it over your datasets! +**Image Classification** +```python +from data_gradients.managers.classification_manager import ClassificationAnalysisManager + +train_data = ... # Your dataset iterable (torch dataset/dataloader/...) +val_data = ... # Your dataset iterable (torch dataset/dataloader/...) +class_names = ... # [, , ...] + +analyzer = ClassificationAnalysisManager( + report_title="Testing Data-Gradients Classification", + train_data=train_data, + val_data=val_data, + class_names=class_names, +) + +analyzer.run() +``` + **Object Detection** ```python from data_gradients.managers.detection_manager import DetectionAnalysisManager -train_data = ... -val_data = ... -class_names = ... +train_data = ... 
# Your dataset iterable (torch dataset/dataloader/...) +val_data = ... # Your dataset iterable (torch dataset/dataloader/...) +class_names = ... # [, , ...] analyzer = DetectionAnalysisManager( report_title="Testing Data-Gradients Object Detection", @@ -141,9 +158,9 @@ analyzer.run() ```python from data_gradients.managers.segmentation_manager import SegmentationAnalysisManager -train_data = ... -val_data = ... -class_names = ... +train_data = ... # Your dataset iterable (torch dataset/dataloader/...) +val_data = ... # Your dataset iterable (torch dataset/dataloader/...) +class_names = ... # [, , ...] analyzer = SegmentationAnalysisManager( report_title="Testing Data-Gradients Segmentation", @@ -171,97 +188,18 @@ The feature configuration allows you to run the analysis on a subset of features If you are interested in customizing this configuration, you can check out the [documentation](documentation/feature_configuration.md) on that topic. -## Dataset Adapters -Before implementing a Dataset Adapter try running without it, in many cases DataGradient will support your dataset without any code. - -Two type of Dataset Adapters are available: `images_extractor` and `labels_extractor`. These functions should be passed to the main Analyzer function init. +## Dataset Extractors +**Ensuring Comprehensive Dataset Compatibility** -```python -from data_gradients.managers.segmentation_manager import SegmentationAnalysisManager +Integrating datasets with unique structures can present challenges. +To address this, DataGradients offers `extractors` tailored for enhancing compatibility with diverse dataset formats. -train_data = ... -val_data = ... +**Highlights**: +- Customized dataset outputs or distinctive annotation methodologies can be seamlessly accommodated using extractors. +- DataGradients is adept at automatic dataset inference; however, certain specificities, such as distinct image channel orders or bounding box definitions, may necessitate a tailored approach. 
-# Let Assume that in this case, the train_data and val_data return data in this format: -# (image, {"masks", "bboxes"}) -images_extractor = lambda data: data[0] # Extract the image -labels_extractor = lambda data: data[1]['masks'] # Extract the masks - -# In case of segmentation. -SegmentationAnalysisManager( - report_title="Test with Adapters", - train_data=train_data, - val_data=val_data, - images_extractor=images_extractor, - labels_extractor=labels_extractor, -) - -# For Detection, just change the Manager and the label_extractor definition. -``` +For an in-depth understanding and implementation details, we encourage a thorough review of the [Dataset Extractors Documentation](./documentation/dataset_extractors.md). -### Image Adapter -Image Adapter functions should respect the following: - -`images_extractor(data: Any) -> torch.Tensor` - -- `data` being the output of the dataset/dataloader that you provided. -- The function should return a Tensor representing your image(s). One of: - - `(BS, C, H, W)`, `(BS, H, W, C)`, `(BS, H, W)` for batch - - `(C, H, W)`, `(H, W, C)`, `(H, W)` for single image - - With `C`: number of channels (3 for RGB) - - -### Label Adapter -Label Adapter functions should respect the following: - -`labels_extractor(data: Any) -> torch.Tensor` - -- `data` being the output of the dataset/dataloader that you provided. 
-- The function should return a Tensor representing your labels(s): - - For **Segmentation**, one of: - - `(BS, C, H, W)`, `(BS, H, W, C)`, `(BS, H, W)` for batch - - `(C, H, W)`, `(H, W, C)`, `(H, W)` for single image - - `BS`: Batch Size - - `C`: number of channels - 3 for RGB - - `H`, `W`: Height and Width - - For **Detection**, one of: - - `(BS, N, 5)`, `(N, 6)` for batch - - `(N, 5)` for single image - - `BS`: Batch Size - - `N`: Padding size - - The last dimension should include your `class_id` and `bbox` - `class_id, x, y, x, y` for instance - - -### Example - -Let's imagine that your dataset returns a couple of `(image, annotation)` with `annotation` as below: -``` python -annotation = [ - {"bbox_coordinates": [1.08, 187.69, 611.59, 285.84], "class_id": 51}, - {"bbox_coordinates": [5.02, 321.39, 234.33, 365.42], "class_id": 52}, - ... -] -``` - -Because this dataset includes a very custom type of `annotation`, you will need to implement your own custom `labels_extractor` as below: -``` python -from data_gradients.managers.segmentation_manager import SegmentationAnalysisManager - -def labels_extractor(data: Tuple[PIL.Image.Image, List[Dict]]) -> torch.Tensor: - _image, annotations = data[:2] - labels = [] - for annotation in annotations: - class_id = annotation["class_id"] - bbox = annotation["bbox_coordinates"] - labels.append((class_id, *bbox)) - return torch.Tensor(labels) - - -SegmentationAnalysisManager( - ..., - labels_extractor=labels_extractor -) -``` ## Pre-computed Dataset Analysis diff --git a/documentation/dataset_extractors.md b/documentation/dataset_extractors.md new file mode 100644 index 00000000..b30f2685 --- /dev/null +++ b/documentation/dataset_extractors.md @@ -0,0 +1,114 @@ +# Dataset Extractors in DataGradients + +**If your dataset isn't plug-and-play with DataGradients, Dataset Extractors are here to help!** + +## Table of Contents +1. [Introduction](#1-introduction) +2. 
[What are Dataset Extractors?](#2-what-are-dataset-extractors) +3. [When Do You Need Dataset Extractors?](#3-when-do-you-need-dataset-extractors) +4. [Implementing Dataset Extractors](#4-implementing-dataset-extractors) +5. [Extractor Structures](#5-extractor-structures) + - [Image Extractor](#image-extractor) + - [Label Extractor](#label-extractor) +6. [Practical Example](#6-practical-example) + + +## 1. Introduction +DataGradients aims to automatically recognize your dataset's structure and output format. +This includes variations in image channel order, bounding box format, and segmentation mask type. + +However, unique datasets, especially with a nested data structure, may require Dataset Extractors for customized handling. + + +## 2. What are Dataset Extractors? +Dataset Extractors are user-defined functions that guide DataGradients in interpreting non-standard datasets. +The two primary extractors are: +- **`images_extractor`**: Responsible for extracting image data in a friendly format. +- **`labels_extractor`**: Responsible for extracting label data in a friendly format. + + +## 3. When Do You Need Dataset Extractors? +DataGradients is designed to automatically recognize standard dataset structures. +Yet, intricate or nested formats might be challenging for auto-inference. + +For these unique datasets, Dataset Extractors ensure seamless interfacing with DataGradients. + + +## 4. Implementing Dataset Extractors +After determining the need for extractors, integrate them during the instantiation of the Analysis Manager. 
+For illustration: + +```python +from data_gradients.managers.segmentation_manager import SegmentationAnalysisManager + +# Sample dataset returns: (image, {"masks", "bboxes"}) +images_extractor = lambda data: data[0] # Extract the image +labels_extractor = lambda data: data[1]['masks'] # Extract the masks + +SegmentationAnalysisManager( + report_title="Test with Extractors", + train_data=train_data, + val_data=val_data, + images_extractor=images_extractor, + labels_extractor=labels_extractor +) +``` + +## 5. Extractor Structures + +### Image Extractor +Function signature: +```python +images_extractor(data: Any) -> torch.Tensor +``` +Output must be a tensor representing your image(s): + - Batched: `(BS, C, H, W)`, `(BS, H, W, C)`, `(BS, H, W)` + - Single Image: `(C, H, W)`, `(H, W, C)`, `(H, W)` + - Where: + - `C`: Number of channels (e.g., 3 for RGB) + - `BS`: Batch Size + - `H`, `W`: Height and Width, respectively + +### Label Extractor +Function signature: +```python +labels_extractor(data: Any) -> torch.Tensor +``` +Depending on the task, the tensor format will differ: + +- **Segmentation**: + - Batched: `(BS, C, H, W)`, `(BS, H, W, C)`, `(BS, H, W)` + - Single Image: `(C, H, W)`, `(H, W, C)`, `(H, W)` +- **Detection**: + - Batched: `(BS, N, 5)`, `(N, 6)` + - Single Image: `(N, 5)` + - Last dimension details: `class_id, x1, y1, x2, y2` +- Where: + - `C`: Number of channels (e.g., 3 for RGB) + - `BS`: Batch Size + - `H`, `W`: Height and Width, respectively + +## 6. Practical Example +For a dataset returning a tuple `(image, annotation)` where `annotation` is structured as follows: + +```python +annotation = [ + {"bbox_coordinates": [1.08, 187.69, 611.59, 285.84], "class_id": 51}, + ... +] +``` + +A suitable `labels_extractor` would be: + +```python +import torch + +def labels_extractor(data) -> torch.Tensor: + _, annotations = data # annotations = [{"bbox_coordinates": [1.08, 187.69, 611.59, 285.84], "class_id": 51}, ...] 
+    labels = []
+    for annotation in annotations:
+        class_id = annotation["class_id"]
+        bbox = annotation["bbox_coordinates"]
+        labels.append((class_id, *bbox))
+    return torch.Tensor(labels)  # tensor([[51, 1.08, 187.69, 611.59, 285.84], ...])
+```

From 6414be51cdffbe57339ea18ca4f2a0aa7eba1f5a Mon Sep 17 00:00:00 2001
From: Louis Dupont
Date: Thu, 31 Aug 2023 16:21:50 +0300
Subject: [PATCH 4/5] minor update

---
 README.md | 49 +++++++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index a674ca4a..94962bae 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@ Non-exhaustive list of supported features.
 
 > 📘 **Deep Dive into Data Profiling**
 > Puzzled by some dataset challenges while using DataGradients? We've got you covered.
-> Enrich your understanding with our **[🎓free online course](https://deci.ai/course/profiling-computer-vision-datasets-overview/?utm_campaign[…]=DG-PDF-report&utm_medium=DG-repo&utm_content=DG-Report-to-course)**. Dive into dataset profiling, confront its complexities, and harness the full potential of DataGradients.
+> Enrich your understanding with this **[🎓free online course](https://deci.ai/course/profiling-computer-vision-datasets-overview/?utm_campaign[…]=DG-PDF-report&utm_medium=DG-repo&utm_content=DG-Report-to-course)**. Dive into dataset profiling, confront its complexities, and harness the full potential of DataGradients.
@@ -88,25 +88,17 @@ pip install data-gradients ### Prerequisites - **Dataset**: Includes a **Train** set and a **Validation** or a **Test** set. -- One of - - **Class Names**: A list of the unique categories present in your dataset. - - **Number of classes**: How many unique classes appear in your dataset (make sure that this number is greater than the highest class index) - **Dataset Iterable**: A method to iterate over your Dataset providing images and labels. Can be any of the following: - PyTorch **Dataloader** - PyTorch **Dataset** - Generator that yields image/label pairs - Any other iterable you use for model training/validation +- One of: + - **Class Names**: A list of the unique categories present in your dataset. + - **Number of classes**: Indicate how many unique classes are in your dataset. Ensure this number is greater than the highest class index (e.g., if your highest class index is 9, the number of classes should be at least 10). Please ensure all the points above are checked before you proceed with **DataGradients**. -**Good to Know**: DataGradients will try to find out how the dataset returns images and labels. -- If something cannot be automatically determined, you will be asked to provide some extra information through a text input. -- In some extreme cases, the process will crash and invite you to implement a custom dataset adapter (see relevant section) - -**Heads up**: We currently provide a few out-of-the-box [dataset/dataloader](./documentation/datasets.md) implementation. -You can find more dataset implementations in [PyTorch](https://pytorch.org/vision/stable/datasets.html) -or [SuperGradients](https://docs.deci.ai/super-gradients/src/super_gradients/training/datasets/Dataset_Setup_Instructions.html). - **Example** ``` python from torchvision.datasets import CocoDetection @@ -116,16 +108,24 @@ val_data = CocoDetection(...) class_names = ["person", "bicycle", "car", "motorcycle", ...] 
``` +> **Good to Know** - DataGradients will try to find out how the dataset returns images and labels. +> - If something cannot be automatically determined, you will be asked to provide some extra information through a text input. +> - In some extreme cases, the process will crash and invite you to implement a custom [dataset extractor](#dataset-extractors) -### Dataset Analysis +> **Heads up** - DataGradients provides a few out-of-the-box [dataset/dataloader](./documentation/datasets.md) implementation. +> You can find more dataset implementations in [PyTorch](https://pytorch.org/vision/stable/datasets.html) +> or [SuperGradients](https://docs.deci.ai/super-gradients/src/super_gradients/training/datasets/Dataset_Setup_Instructions.html). + + +## Dataset Analysis You are now ready to go, chose the relevant analyzer for your task and run it over your datasets! **Image Classification** ```python from data_gradients.managers.classification_manager import ClassificationAnalysisManager -train_data = ... # Your dataset iterable (torch dataset/dataloader/...) -val_data = ... # Your dataset iterable (torch dataset/dataloader/...) +train_data = ... # Your dataset iterable (torch dataset/dataloader/...) +val_data = ... # Your dataset iterable (torch dataset/dataloader/...) class_names = ... # [, , ...] analyzer = ClassificationAnalysisManager( @@ -142,8 +142,8 @@ analyzer.run() ```python from data_gradients.managers.detection_manager import DetectionAnalysisManager -train_data = ... # Your dataset iterable (torch dataset/dataloader/...) -val_data = ... # Your dataset iterable (torch dataset/dataloader/...) +train_data = ... # Your dataset iterable (torch dataset/dataloader/...) +val_data = ... # Your dataset iterable (torch dataset/dataloader/...) class_names = ... # [, , ...] analyzer = DetectionAnalysisManager( @@ -161,8 +161,8 @@ analyzer.run() ```python from data_gradients.managers.segmentation_manager import SegmentationAnalysisManager -train_data = ... 
# Your dataset iterable (torch dataset/dataloader/...)
-val_data = ... # Your dataset iterable (torch dataset/dataloader/...)
+train_data = ...  # Your dataset iterable (torch dataset/dataloader/...)
+val_data = ...    # Your dataset iterable (torch dataset/dataloader/...)
 class_names = ... # [, , ...]
 
 analyzer = SegmentationAnalysisManager(
@@ -181,8 +181,8 @@ You can test the segmentation analysis tool in the following [example](https://g
 which does not require you to download any additional data.
 
 
-### Report
-Once the analysis is done, the path to your pdf report will be printed.
+## Report
+Once the analysis is done, the path to your pdf report will be printed. 
 You can find here examples of [pre-computed dataset analysis reports](#pre-computed-dataset-analysis).
 
 ## Feature Configuration
@@ -194,12 +194,9 @@ If you are interested in customizing this configuration, you can check out the [
 ## Dataset Extractors
 **Ensuring Comprehensive Dataset Compatibility**
 
-Integrating datasets with unique structures can present challenges.
-To address this, DataGradients offers `extractors` tailored for enhancing compatibility with diverse dataset formats.
+DataGradients is adept at automatic dataset inference; however, certain specificities, such as nested annotation structures or unique annotation formats, may necessitate a tailored approach.
 
-**Highlights**:
-- Customized dataset outputs or distinctive annotation methodologies can be seamlessly accommodated using extractors.
-- DataGradients is adept at automatic dataset inference; however, certain specificities, such as distinct image channel orders or bounding box definitions, may necessitate a tailored approach.
+To address this, DataGradients offers `extractors` tailored for enhancing compatibility with diverse dataset formats.
 
 For an in-depth understanding and implementation details, we encourage a thorough review of the [Dataset Extractors Documentation](./documentation/dataset_extractors.md).
From 5c5f0957336144afa8203349e01fe4a97e3ffb37 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 31 Aug 2023 16:27:41 +0300 Subject: [PATCH 5/5] use __file__ --- scripts/describe_datasets.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/describe_datasets.py b/scripts/describe_datasets.py index 2f5abe6e..32759ea6 100644 --- a/scripts/describe_datasets.py +++ b/scripts/describe_datasets.py @@ -1,7 +1,9 @@ import inspect +import re +import os + from data_gradients.datasets import detection as detection_datasets from data_gradients.datasets import segmentation as segmentation_datasets -import re def remove_first_indentation(s: str) -> str: @@ -70,5 +72,6 @@ def class_to_github_url(class_obj: type) -> str: {dataset_descriptions}""" -with open("../documentation/datasets.md", "w") as f: +root_dir = os.path.dirname(os.path.dirname(__file__)) +with open(os.path.join(root_dir, "documentation", "datasets.md"), "w") as f: f.write(summary)