Feature/sg 953 add XML/VOC datasets (#117)

* add yolo * improve doc and name * first draft * support VOC * wip * rename * add more explicit default extensions * upadte extension list * update strings * rename annotation to label * add lena nd iter * add s * minor fix * add len and iter * inherit * replace image_set with split * add doc * add staticmethod * fix naming * improve doc * improve config_path doc
Deci-AI · Jul 4, 2023 · 8940293 · 8940293
1 parent 310acde
commit 8940293
Show file tree

Hide file tree

Showing 7 changed files with 640 additions and 15 deletions.
diff --git a/documentation/datasets.md b/documentation/datasets.md
@@ -6,7 +6,7 @@ These datasets contain only the very basic functionalities and are not recommend
 ## Object Detection
 
 
-### Yolo Format Dataset
+### Yolo Format Detection Dataset
 
 The Yolo format Detection Dataset supports any dataset stored in the YOLO format.
 
@@ -79,6 +79,194 @@ dataset_root/
 ```python
 from data_gradients.datasets.detection import YoloFormatDetectionDataset
 
-train_loader = YoloFormatDetectionDataset(root_dir="<path/to/dataset_root>", images_dir="images/train", labels_dir="labels/train")
-val_loader = YoloFormatDetectionDataset(root_dir="<path/to/dataset_root>", images_dir="images/validation", labels_dir="labels/validation")
+train_set = YoloFormatDetectionDataset(root_dir="<path/to/dataset_root>", images_dir="images/train", labels_dir="labels/train")
+val_set = YoloFormatDetectionDataset(root_dir="<path/to/dataset_root>", images_dir="images/validation", labels_dir="labels/validation")
+```
+
+### VOC Format Detection Dataset
+
+The VOC format Detection Dataset supports datasets where labels are stored in XML files that follow the VOC standard.
+
+#### Expected folder structure
+Any structure including at least one sub-directory for images and one for xml labels. They can be the same.
+
+Example 1: Separate directories for images and labels
+```
+    dataset_root/
+        ├── images/
+        │   ├── train/
+        │   │   ├── 1.jpg
+        │   │   ├── 2.jpg
+        │   │   └── ...
+        │   ├── test/
+        │   │   ├── ...
+        │   └── validation/
+        │       ├── ...
+        └── labels/
+            ├── train/
+            │   ├── 1.xml
+            │   ├── 2.xml
+            │   └── ...
+            ├── test/
+            │   ├── ...
+            └── validation/
+                ├── ...
+```
+
+Example 2: Same directory for images and labels
+```
+    dataset_root/
+        ├── train/
+        │   ├── 1.jpg
+        │   ├── 1.xml
+        │   ├── 2.jpg
+        │   ├── 2.xml
+        │   └── ...
+        └── validation/
+            ├── ...
+```
+
+**Note**: The label files need to be stored in XML format, but the file extension can be different.
+
+#### Expected label files structure
+The label files must be structured in XML format, like in the following example:
+
+``` xml
+<annotation>
+    <object>
+        <name>chair</name>
+        <bndbox>
+            <xmin>1</xmin>
+            <ymin>213</ymin>
+            <xmax>263</xmax>
+            <ymax>375</ymax>
+        </bndbox>
+    </object>
+    <object>
+        <name>sofa</name>
+        <bndbox>
+            <xmin>104</xmin>
+            <ymin>151</ymin>
+            <xmax>334</xmax>
+            <ymax>287</ymax>
+        </bndbox>
+    </object>
+</annotation>
+```
+
+
+#### Instantiation
+```
+dataset_root/
+    ├── images/
+    │   ├── train/
+    │   │   ├── 1.jpg
+    │   │   ├── 2.jpg
+    │   │   └── ...
+    │   ├── test/
+    │   │   ├── ...
+    │   └── validation/
+    │       ├── ...
+    └── labels/
+        ├── train/
+        │   ├── 1.xml
+        │   ├── 2.xml
+        │   └── ...
+        ├── test/
+        │   ├── ...
+        └── validation/
+            ├── ...
+```
+
+```python
+from data_gradients.datasets.detection import VOCFormatDetectionDataset
+
+train_set = VOCFormatDetectionDataset(root_dir="<path/to/dataset_root>", images_dir="images/train", labels_dir="labels/train")
+val_set = VOCFormatDetectionDataset(root_dir="<path/to/dataset_root>", images_dir="images/validation", labels_dir="labels/validation")
+```
+
+
+### VOC Detection Dataset
+VOC Detection Dataset is a sub-class of the [VOC Format Detection Dataset](#voc-format-detection-dataset),
+where the folders are structured exactly like in the original PascalVOC dataset.
+
+#### Expected folder structure
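+The dataset must follow the original PascalVOC folder structure, with one sub-directory per year (e.g. `VOC2007`, `VOC2012`), each including a sub-directory for images, one for xml labels and one for the image sets.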
+
+Example 1: Separate directories for images and labels
+```
+dataset_root/
+    ├── VOC2007/
+    │   ├── JPEGImages/
+    │   │   ├── 1.jpg
+    │   │   ├── 2.jpg
+    │   │   └── ...
+    │   ├── Annotations/
+    │   │   ├── 1.xml
+    │   │   ├── 2.xml
+    │   │   └── ...
+    │   └── ImageSets/
+    │       └── Main
+    │           ├── train.txt
+    │           ├── val.txt
+    │           ├── trainval.txt
+    │           └── ...
+    └── VOC2012/
+        └── ...
+```
+
+
+**Note**: The label files need to be stored in XML format, but the file extension can be different.
+
+#### Expected label files structure
+The label files must be structured in XML format, like in the following example:
+
+``` xml
+<annotation>
+    <object>
+        <name>chair</name>
+        <bndbox>
+            <xmin>1</xmin>
+            <ymin>213</ymin>
+            <xmax>263</xmax>
+            <ymax>375</ymax>
+        </bndbox>
+    </object>
+    <object>
+        <name>sofa</name>
+        <bndbox>
+            <xmin>104</xmin>
+            <ymin>151</ymin>
+            <xmax>334</xmax>
+            <ymax>287</ymax>
+        </bndbox>
+    </object>
+</annotation>
+```
+
+
+#### Instantiation
+Let's take an example where we only have VOC2012
+```
+dataset_root/
+    └── VOC2012/
+        ├── JPEGImages/
+        │   ├── 1.jpg
+        │   ├── 2.jpg
+        │   └── ...
+        ├── Annotations/
+        │   ├── 1.xml
+        │   ├── 2.xml
+        │   └── ...
+        └── ImageSets/
+            └── Main
+                ├── train.txt
+                └── val.txt
+```
+
+```python
+from data_gradients.datasets.detection import VOCDetectionDataset
+
+train_set = VOCDetectionDataset(root_dir="<path/to/dataset_root>", year=2012, split="train")
+val_set = VOCDetectionDataset(root_dir="<path/to/dataset_root>", year=2012, split="val")
 ```
diff --git a/src/data_gradients/datasets/FolderProcessor.py b/src/data_gradients/datasets/FolderProcessor.py
@@ -41,11 +41,11 @@ def __init__(
         verbose: bool = True,
     ):
         """
-        :param images_dir:      The directory containing the images.
-        :param labels_dir:      The directory containing the labels.
-        :param label_extensions: The extensions of the labels. Only the labels with these extensions will be considered.
-        :param image_extensions: The extensions of the images. Only the images with these extensions will be considered.
-        :param verbose:         Whether to print extra messages.
+        :param images_dir:          The directory containing the images.
+        :param labels_dir:          The directory containing the labels.
+        :param label_extensions:    The extensions of the labels. Only the labels with these extensions will be considered.
+        :param image_extensions:    The extensions of the images. Only the images with these extensions will be considered.
+        :param verbose:             Whether to print extra messages.
         """
 
         self.images_dir = images_dir
@@ -60,7 +60,11 @@ def _normalize_extension(self, extensions: List[str]) -> List[str]:
         return [ext.replace(".", "").lower() for ext in extensions]
 
     def get_image_and_label_file_names(self, images_dir: str, labels_dir: str) -> List[Tuple[str, str]]:
-        """Gather all image and label files from the provided sub_dirs."""
+        """Gather all image and label files that are in the directories.
+        :param images_dir:      The directory containing the images.
+        :param labels_dir:      The directory containing the labels.
+        :return:                A list of tuple(<path-to-image>, <path-to-label>).
+        """
         images_with_labels_files = []
 
         if not os.path.exists(images_dir):
@@ -83,10 +87,9 @@ def _get_file_names_in_folder(self, images_dir: str, labels_dir: str) -> Tuple[L
 
     def _match_file_names(self, all_images_file_names: List[str], all_labels_file_names: List[str]) -> List[Tuple[str, str]]:
         """Matches the names of image and label files."""
-        base_name = lambda file_name: os.path.splitext(os.path.basename(file_name))[0]
 
-        image_file_base_names = {base_name(file_name): file_name for file_name in all_images_file_names}
-        label_file_base_names = {base_name(file_name): file_name for file_name in all_labels_file_names}
+        image_file_base_names = {self.get_filename(file_name): file_name for file_name in all_images_file_names}
+        label_file_base_names = {self.get_filename(file_name): file_name for file_name in all_labels_file_names}
 
         common_base_names = set(image_file_base_names.keys()) & set(label_file_base_names.keys())
         unmatched_image_files = set(image_file_base_names.keys()) - set(label_file_base_names.keys())
@@ -108,6 +111,10 @@ def is_label(self, filename: str) -> bool:
         """Check if the given file name refers to a label."""
         return filename.split(".")[-1].lower() in self.label_extensions
 
+    @staticmethod
+    def get_filename(file_name: str) -> str:
+        """Return the name of a file, without its directory and without its (last) extension.
+
+        :param file_name:   Path or name of a file, e.g. `"images/train/1.jpg"`.
+        :return:            The bare file name, e.g. `"1"`.
+        """
+        stem, _extension = os.path.splitext(os.path.basename(file_name))
+        return stem
+
     def __len__(self):
         """Return the number of (image, label) file pairs held by the iterator."""
         matched_pairs = self.images_with_labels_files
         return len(matched_pairs)
 
@@ -117,3 +124,87 @@ def __getitem__(self, index):
     def __iter__(self):
         """Yield every (image, label) file pair, in the order in which they are stored."""
         yield from self.images_with_labels_files
+
+
+class ImageLabelConfigIterator(ImageLabelFilesIterator):
+    """Iterate over the image and label files that are listed in a config file.
+
+    Only the (image, label) pairs that are found in the provided directories AND whose file name
+    (without directory and extension) is listed in `config_path` are kept.
+    """
+
+    def __init__(
+        self,
+        images_dir: str,
+        labels_dir: str,
+        config_path: str,
+        label_extensions: Sequence[str],
+        image_extensions: Sequence[str] = DEFAULT_IMG_EXTENSIONS,
+        verbose: bool = True,
+    ):
+        """
+        :param images_dir:          The directory containing the images.
+        :param labels_dir:          The directory containing the labels.
+        :param config_path:         Path to the config file. This config file should contain the list of file ids to include,
+                                    separated by whitespace (e.g. one file id per line).
+        :param label_extensions:    The extensions of the labels. Only the labels with these extensions will be considered.
+        :param image_extensions:    The extensions of the images. Only the images with these extensions will be considered.
+        :param verbose:             Whether to print extra messages.
+        """
+        # `config_path` is read by `get_image_and_label_file_names`, so it is set before the parent constructor runs.
+        # NOTE(review): presumably the parent `__init__` calls `get_image_and_label_file_names` - confirm.
+        self.config_path = config_path
+        super().__init__(
+            images_dir=images_dir,
+            labels_dir=labels_dir,
+            label_extensions=label_extensions,
+            image_extensions=image_extensions,
+            verbose=verbose,
+        )
+
+    def get_image_and_label_file_names(self, images_dir: str, labels_dir: str) -> List[Tuple[str, str]]:
+        """Gather all image and label files that are both listed in the config_path and in the directories.
+
+        The returned pairs follow the order of the file ids in the config file.
+
+        :param images_dir:      The directory containing the images.
+        :param labels_dir:      The directory containing the labels.
+        :return:                A list of tuple(<path-to-image>, <path-to-label>).
+        :raises RuntimeError:   If none of the file ids listed in the config file matches a file in the directories.
+        """
+        all_images_with_labels_files = super().get_image_and_label_file_names(images_dir=images_dir, labels_dir=labels_dir)
+        file_ids = self._load_file_ids(config_path=self.config_path)
+
+        # Index the pairs by image file name (no directory, no extension), which is what a file id is compared to.
+        filename_to_images_with_labels_files = {
+            self.get_filename(image_path): (image_path, label_path) for (image_path, label_path) in all_images_with_labels_files
+        }
+
+        images_with_labels_files = []
+        for file_id in file_ids:
+            if file_id in filename_to_images_with_labels_files:
+                images_with_labels_files.append(filename_to_images_with_labels_files[file_id])
+            elif self.verbose:
+                logger.warning(
+                    f"No file with `file_id={file_id}` found in `images_dir={images_dir}` and/or `labels_dir={labels_dir}`. "
+                    f"Hide this message by setting `verbose=False`."
+                )
+
+        if not images_with_labels_files:
+            error_msg = (
+                f"Out of {len(file_ids)} file ids found in `config_path={self.config_path}`, "
+                f"no matching file found in `images_dir={images_dir}` and/or `labels_dir={labels_dir}`."
+            )
+            if not self.verbose:
+                error_msg += "\nSet `verbose=True` for more information."
+            raise RuntimeError(error_msg)
+
+        # The message states that it can be hidden with `verbose=False`, so it is only logged in verbose mode.
+        if self.verbose and len(images_with_labels_files) != len(file_ids):
+            logger.warning(
+                f"Out of {len(file_ids)} file ids found in `config_path={self.config_path}`, "
+                f"{len(images_with_labels_files)} were found in both `images_dir={images_dir}` and `labels_dir={labels_dir}`. "
+                f"Hide this message by setting `verbose=False`."
+            )
+
+        return images_with_labels_files
+
+    def _load_file_ids(self, config_path: str) -> List[str]:
+        """Load the file ids listed in the config file.
+
+        :param config_path:         Path to the config file, containing file ids separated by whitespace.
+        :return:                    The list of file ids, in the order in which they appear in the file.
+        :raises FileNotFoundError:  If `config_path` does not exist.
+        :raises RuntimeError:       If the file cannot be read/decoded, or if it contains no file id.
+        """
+        if not os.path.exists(config_path):
+            raise FileNotFoundError(f"The config file `{config_path}` does not exist.")
+
+        with open(config_path, "r") as f:
+            try:
+                file_ids = f.read().split()
+            except (OSError, UnicodeError) as e:
+                # An exception instance is not callable: raise a new exception chained to the original cause instead.
+                raise RuntimeError(f"Could not properly parse `config_path={config_path}`") from e
+
+        if not file_ids:
+            raise RuntimeError(f"`config_path={config_path}` is empty and contains no file IDs.")
+
+        return file_ids
diff --git a/src/data_gradients/datasets/__init__.py b/src/data_gradients/datasets/__init__.py
@@ -1,4 +1,4 @@
-from data_gradients.datasets.detection import YoloFormatDetectionDataset
+from data_gradients.datasets.detection import VOCDetectionDataset, VOCFormatDetectionDataset, YoloFormatDetectionDataset
 from data_gradients.datasets.bdd_dataset import BDDDataset
 
-__all__ = ["YoloFormatDetectionDataset", "BDDDataset"]
+__all__ = ["VOCDetectionDataset", "VOCFormatDetectionDataset", "YoloFormatDetectionDataset", "BDDDataset"]
diff --git a/src/data_gradients/datasets/detection/__init__.py b/src/data_gradients/datasets/detection/__init__.py
@@ -1,3 +1,5 @@
+from data_gradients.datasets.detection.voc_detection_dataset import VOCDetectionDataset
+from data_gradients.datasets.detection.voc_format_detection_dataset import VOCFormatDetectionDataset
 from data_gradients.datasets.detection.yolo_format_detection_dataset import YoloFormatDetectionDataset
 
-__all__ = ["YoloFormatDetectionDataset"]
+__all__ = ["VOCDetectionDataset", "VOCFormatDetectionDataset", "YoloFormatDetectionDataset"]