diff --git a/cvat/apps/annotation/README.md b/cvat/apps/annotation/README.md index 308b2341109a..c41bc22469e7 100644 --- a/cvat/apps/annotation/README.md +++ b/cvat/apps/annotation/README.md @@ -170,44 +170,58 @@ This is native CVAT annotation format. - supported shapes - Rectangles, Polygons, Polylines, Points ### [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/) +- [Format specification](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/devkit_doc.pdf) #### Pascal dumper description -- downloaded file: a zip archive with following structure: +- downloaded file: a zip archive of the following structure: ```bash - taskname.zip - ├── frame_000001.xml - ├── frame_000002.xml - ├── frame_000003.xml - └── ... + taskname.zip/ + ├── Annotations/ + │   ├── .xml + │   ├── .xml + │   └── .xml + ├── ImageSets/ + │   └── Main/ + │   └── default.txt + └── labelmap.txt ``` - Each annotation `*.xml` file has a name that corresponds to the name of the image file - (e.g. `frame_000001.xml` is the annotation for the `frame_000001.jpg` image). - Detailed structure specification of the `*.xml` file can be found - [here](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/devkit_doc.pdf). -- supported shapes - Rectangles -- additional comments: If you plan to use 'truncated' and 'difficult' attributes please add the corresponding + +- supported shapes: Rectangles +- additional comments: If you plan to use `truncated` and `difficult` attributes please add the corresponding items to the CVAT label attributes: `~checkbox=difficult:false ~checkbox=truncated:false` #### Pascal loader description -- uploaded file: a zip archive with following structure: - ```bash - taskname.zip - ├── frame_000001.xml - ├── frame_000002.xml - ├── frame_000003.xml - └── ... - ``` - It should be possible to match the CVAT frame(imagename) and image filename from the annotation \*.xml - file (the tag filename, e.g. `2008_004457.jpg`). There are 2 options: - 1. 
full match between image name and filename from annotation *.xml - file (in case of a task was created from images or archive of images). - 1. match by frame number (if CVAT cannot match by name). File name should be in the following format `frame_%6d.jpg`. - It will be used when task was created from a video. +- uploaded file: a zip archive of the structure declared above or the following: + ```bash + taskname.zip/ + ├── <image_name1>.xml + ├── <image_name2>.xml + ├── <image_nameN>.xml + └── labelmap.txt # optional + ``` -- supported shapes: Rectangles -- limitations: Support of Pascal VOC object detection format -- additional comments: the CVAT task should be created with the full label set that may be in the annotation files + The `labelmap.txt` file contains dataset labels. It **must** be included + if dataset labels **differ** from VOC default labels. The file structure: + ```bash + # label : color_rgb : 'body' parts : actions + background::: + aeroplane::: + bicycle::: + bird::: + ``` + + It must be possible for CVAT to match the frame (image name) and file name from annotation \*.xml + file (the tag filename, e.g. `2008_004457.jpg`). There are 2 options: + 1. full match between image name and filename from annotation \*.xml + (in cases when task was created from images or image archive). + 1. match by frame number (if CVAT cannot match by name). File name should + be in the following format `<number>.jpg`. + It should be used when task was created from a video. + +- supported shapes: Rectangles +- limitations: Support of Pascal VOC object detection format +- additional comments: the CVAT task should be created with the full label set that may be in the annotation files #### How to create a task from Pascal VOC dataset 1. Download the Pascal Voc dataset (Can be downloaded from the @@ -222,7 +236,7 @@ This is native CVAT annotation format. (See [Creating an annotation task](cvat/apps/documentation/user_guide.md#creating-an-annotation-task) guide for details) 1. zip the corresponding annotation files -1. 
click `Upload annotation` button, choose `Pascal VOC ZIP 1.0` +1. click `Upload annotation` button, choose `Pascal VOC ZIP 1.1` and select the *.zip file with annotations from previous step. It may take some time. diff --git a/cvat/apps/annotation/pascal_voc.py b/cvat/apps/annotation/pascal_voc.py index 7d6c79a09230..e65a186b6e74 100644 --- a/cvat/apps/annotation/pascal_voc.py +++ b/cvat/apps/annotation/pascal_voc.py @@ -8,7 +8,7 @@ { "display_name": "{name} {format} {version}", "format": "ZIP", - "version": "1.0", + "version": "1.1", "handler": "dump" }, ], @@ -16,101 +16,57 @@ { "display_name": "{name} {format} {version}", "format": "ZIP", - "version": "1.0", + "version": "1.1", "handler": "load" }, ], } def load(file_object, annotations): - from pyunpack import Archive + from glob import glob import os + import os.path as osp + import shutil + from pyunpack import Archive from tempfile import TemporaryDirectory + from datumaro.plugins.voc_format.importer import VocImporter + from cvat.apps.dataset_manager.bindings import import_dm_annotations - def parse_xml_file(annotation_file): - import xml.etree.ElementTree as ET - root = ET.parse(annotation_file).getroot() - frame_number = annotations.match_frame(root.find('filename').text) - - for obj_tag in root.iter('object'): - bbox_tag = obj_tag.find("bndbox") - label = obj_tag.find('name').text - xmin = float(bbox_tag.find('xmin').text) - ymin = float(bbox_tag.find('ymin').text) - xmax = float(bbox_tag.find('xmax').text) - ymax = float(bbox_tag.find('ymax').text) - truncated = obj_tag.find('truncated') - truncated = truncated.text if truncated is not None else 0 - difficult = obj_tag.find('difficult') - difficult = difficult.text if difficult is not None else 0 - - annotations.add_shape(annotations.LabeledShape( - type='rectangle', - frame=frame_number, - label=label, - points=[xmin, ymin, xmax, ymax], - occluded=False, - attributes=[ - annotations.Attribute('truncated', truncated), - 
annotations.Attribute('difficult', difficult), - ], - )) - - archive_file = getattr(file_object, 'name') + archive_file = file_object if isinstance(file_object, str) else getattr(file_object, "name") with TemporaryDirectory() as tmp_dir: Archive(archive_file).extractall(tmp_dir) - for dirpath, _, filenames in os.walk(tmp_dir): - for _file in filenames: - if '.xml' == os.path.splitext(_file)[1]: - parse_xml_file(os.path.join(dirpath, _file)) + # support flat archive layout + anno_dir = osp.join(tmp_dir, 'Annotations') + if not osp.isdir(anno_dir): + anno_files = glob(osp.join(tmp_dir, '**', '*.xml'), recursive=True) + subsets_dir = osp.join(tmp_dir, 'ImageSets', 'Main') + os.makedirs(subsets_dir, exist_ok=True) + with open(osp.join(subsets_dir, 'train.txt'), 'w') as subset_file: + for f in anno_files: + subset_file.write(osp.splitext(osp.basename(f))[0] + '\n') -def dump(file_object, annotations): - from pascal_voc_writer import Writer - import os - from zipfile import ZipFile - from tempfile import TemporaryDirectory - - with TemporaryDirectory() as out_dir: - with ZipFile(file_object, 'w') as output_zip: - for frame_annotation in annotations.group_by_frame(): - image_name = frame_annotation.name - width = frame_annotation.width - height = frame_annotation.height - - writer = Writer(image_name, width, height) - writer.template_parameters['path'] = '' - writer.template_parameters['folder'] = '' + os.makedirs(anno_dir, exist_ok=True) + for f in anno_files: + shutil.move(f, anno_dir) - for shape in frame_annotation.labeled_shapes: - if shape.type != "rectangle": - continue + dm_project = VocImporter()(tmp_dir) + dm_dataset = dm_project.make_dataset() + import_dm_annotations(dm_dataset, annotations) - label = shape.label - xtl = shape.points[0] - ytl = shape.points[1] - xbr = shape.points[2] - ybr = shape.points[3] - - difficult = 0 - truncated = 0 - for attribute in shape.attributes: - if attribute.name == 'truncated' and 'true' == attribute.value.lower(): - truncated 
= 1 - elif attribute.name == 'difficult' and 'true' == attribute.value.lower(): - difficult = 1 +def dump(file_object, annotations): + from cvat.apps.dataset_manager.bindings import CvatAnnotationsExtractor + from cvat.apps.dataset_manager.util import make_zip_archive + from datumaro.components.project import Environment, Dataset + from tempfile import TemporaryDirectory - writer.addObject( - name=label, - xmin=xtl, - ymin=ytl, - xmax=xbr, - ymax=ybr, - truncated=truncated, - difficult=difficult, - ) + env = Environment() + id_from_image = env.transforms.get('id_from_image_name') - anno_name = os.path.basename('{}.{}'.format(os.path.splitext(image_name)[0], 'xml')) - anno_file = os.path.join(out_dir, anno_name) - writer.save(anno_file) - output_zip.write(filename=anno_file, arcname=anno_name) + extractor = CvatAnnotationsExtractor('', annotations) + extractor = extractor.transform(id_from_image) + extractor = Dataset.from_extractors(extractor) # apply lazy transforms + converter = env.make_converter('voc_detection') + with TemporaryDirectory() as temp_dir: + converter(extractor, save_dir=temp_dir) + make_zip_archive(temp_dir, file_object) \ No newline at end of file diff --git a/cvat/apps/annotation/yolo.py b/cvat/apps/annotation/yolo.py index ec8dcbbd5f00..379ea45abe3d 100644 --- a/cvat/apps/annotation/yolo.py +++ b/cvat/apps/annotation/yolo.py @@ -8,7 +8,7 @@ { "display_name": "{name} {format} {version}", "format": "ZIP", - "version": "1.0", + "version": "1.1", "handler": "dump" }, ], @@ -16,7 +16,7 @@ { "display_name": "{name} {format} {version}", "format": "ZIP", - "version": "1.0", + "version": "1.1", "handler": "load" }, ], diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py index 680c44ed15a7..f33b0b7b615f 100644 --- a/cvat/apps/engine/tests/test_rest_api.py +++ b/cvat/apps/engine/tests/test_rest_api.py @@ -2650,8 +2650,8 @@ def _get_initial_annotation(annotation_format): elif annotation_format == "CVAT XML 1.1 for 
images": annotations["shapes"] = rectangle_shapes_with_attrs + rectangle_shapes_wo_attrs - elif annotation_format == "PASCAL VOC ZIP 1.0" or \ - annotation_format == "YOLO ZIP 1.0" or \ + elif annotation_format == "PASCAL VOC ZIP 1.1" or \ + annotation_format == "YOLO ZIP 1.1" or \ annotation_format == "TFRecord ZIP 1.0": annotations["shapes"] = rectangle_shapes_wo_attrs diff --git a/datumaro/datumaro/plugins/voc_format/converter.py b/datumaro/datumaro/plugins/voc_format/converter.py index 54d13e7e08f6..108fd499eab9 100644 --- a/datumaro/datumaro/plugins/voc_format/converter.py +++ b/datumaro/datumaro/plugins/voc_format/converter.py @@ -235,7 +235,8 @@ def save_subsets(self): if bbox is not None: _write_xml_bbox(bbox, obj_elem) - for part_bbox in filter(lambda x: obj.id == x.group, + for part_bbox in filter( + lambda x: obj.group and obj.group == x.group, layout_bboxes): part_elem = ET.SubElement(obj_elem, 'part') ET.SubElement(part_elem, 'name').text = \ diff --git a/datumaro/datumaro/plugins/voc_format/extractor.py b/datumaro/datumaro/plugins/voc_format/extractor.py index 66d83f0a5f6f..87a3374ffa4e 100644 --- a/datumaro/datumaro/plugins/voc_format/extractor.py +++ b/datumaro/datumaro/plugins/voc_format/extractor.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: MIT from collections import defaultdict +import logging as log import os import os.path as osp from xml.etree import ElementTree as ET @@ -13,7 +14,7 @@ AnnotationType, Label, Mask, Bbox, CompiledMask ) from datumaro.util import dir_items -from datumaro.util.image import lazy_image +from datumaro.util.image import lazy_image, Image from datumaro.util.mask_tools import lazy_mask, invert_colormap from .format import ( @@ -52,8 +53,12 @@ def _load_subsets(self, subsets_dir): subset_name = None subset = __class__.Subset(subset_name, self) + subset.items = [] with open(osp.join(subsets_dir, subset_file_name + '.txt'), 'r') as f: - subset.items = [line.split()[0] for line in f] + for line in f: + line = 
line.split()[0].strip() + if line: + subset.items.append(line) subsets[subset_name] = subset return subsets @@ -84,12 +89,7 @@ def _load_det_annotations(self): for ann_item in det_anno_items: with open(osp.join(det_anno_dir, ann_item + '.xml'), 'r') as f: ann_file_data = f.read() - ann_file_root = ET.fromstring(ann_file_data) - item = ann_file_root.find('filename').text - if not item: - item = ann_item - item = osp.splitext(item)[0] - det_annotations[item] = ann_file_data + det_annotations[ann_item] = ann_file_data self._annotations[VocTask.detection] = det_annotations @@ -134,6 +134,19 @@ def __iter__(self): def _get(self, item_id, subset_name): image = osp.join(self._path, VocPath.IMAGES_DIR, item_id + VocPath.IMAGE_EXT) + det_annotations = self._annotations.get(VocTask.detection) + if det_annotations is not None: + det_annotations = det_annotations.get(item_id) + if det_annotations is not None: + root_elem = ET.fromstring(det_annotations) + height = root_elem.find('size/height') + if height is not None: + height = int(height.text) + width = root_elem.find('size/width') + if width is not None: + width = int(width.text) + if height and width: + image = Image(path=image, size=(height, width)) annotations = self._get_annotations(item_id) @@ -217,7 +230,7 @@ def _get_annotations(self, item_id): for obj_id, object_elem in enumerate(root_elem.findall('object')): obj_id += 1 attributes = {} - group = None + group = obj_id obj_label_id = None label_elem = object_elem.find('name') @@ -262,20 +275,21 @@ def _get_annotations(self, item_id): for action, present in actions.items(): attributes[action] = present + has_parts = False for part_elem in object_elem.findall('part'): part = part_elem.find('name').text part_label_id = self._get_label_id(part) part_bbox = self._parse_bbox(part_elem) - group = obj_id if self._task is not VocTask.person_layout: break if part_bbox is None: continue + has_parts = True item_annotations.append(Bbox(*part_bbox, label=part_label_id, 
group=group)) - if self._task is VocTask.person_layout and not group: + if self._task is VocTask.person_layout and not has_parts: continue if self._task is VocTask.action_classification and not actions: continue @@ -699,7 +713,7 @@ def __init__(self, path): def _load_categories(self): from collections import OrderedDict - from datumaro.components.formats.voc import VocAction + from .format import VocAction label_map = OrderedDict((a.name, [[], [], []]) for a in VocAction) self._categories = make_voc_categories(label_map) diff --git a/datumaro/tests/test_voc_format.py b/datumaro/tests/test_voc_format.py index 12f9e7d33455..b91ee1a9325c 100644 --- a/datumaro/tests/test_voc_format.py +++ b/datumaro/tests/test_voc_format.py @@ -211,7 +211,7 @@ def __iter__(self): 'difficult': False, 'occluded': False, }, - id=1, + id=1, group=1, ), Bbox(4, 5, 2, 2, label=self._label('person'), attributes={ @@ -382,14 +382,14 @@ class DstExtractor(TestExtractorBase): def __iter__(self): return iter([ DatasetItem(id=1, subset='a', annotations=[ - Bbox(2, 3, 4, 5, label=2, id=1, + Bbox(2, 3, 4, 5, label=2, id=1, group=1, attributes={ 'truncated': False, 'difficult': False, 'occluded': True, } ), - Bbox(2, 3, 4, 5, label=3, id=2, + Bbox(2, 3, 4, 5, label=3, id=2, group=2, attributes={ 'truncated': True, 'difficult': False, @@ -399,7 +399,7 @@ def __iter__(self): ]), DatasetItem(id=2, subset='b', annotations=[ - Bbox(5, 4, 6, 5, label=3, id=1, + Bbox(5, 4, 6, 5, label=3, id=1, group=1, attributes={ 'truncated': False, 'difficult': True, @@ -498,16 +498,16 @@ class DstExtractor(TestExtractorBase): def __iter__(self): return iter([ DatasetItem(id=1, subset='a', annotations=[ - Bbox(2, 3, 4, 5, label=2, id=1, - attributes={ + Bbox(2, 3, 4, 5, label=2, + id=1, group=1, attributes={ 'truncated': True, 'difficult': False, 'occluded': False, # no attributes here in the label categories } ), - Bbox(5, 4, 3, 2, label=self._label('person'), id=2, - attributes={ + Bbox(5, 4, 3, 2, 
label=self._label('person'), + id=2, group=2, attributes={ 'truncated': True, 'difficult': False, 'occluded': False, @@ -579,7 +579,7 @@ class DstExtractor(TestExtractorBase): def __iter__(self): yield DatasetItem(id=1, annotations=[ # drop non voc label - Bbox(2, 3, 4, 5, label=self._label('cat'), id=1, + Bbox(2, 3, 4, 5, label=self._label('cat'), id=1, group=1, attributes={ 'truncated': False, 'difficult': False, @@ -615,16 +615,15 @@ def categories(self): class DstExtractor(TestExtractorBase): def __iter__(self): yield DatasetItem(id=1, annotations=[ - Bbox(2, 3, 4, 5, label=self._label(VOC.VocLabel(1).name), id=1, - attributes={ + Bbox(2, 3, 4, 5, label=self._label(VOC.VocLabel(1).name), + id=1, group=1, attributes={ 'truncated': False, 'difficult': False, 'occluded': False, } ), - Bbox(1, 2, 3, 4, - label=self._label('non_voc_label'), id=2, - attributes={ + Bbox(1, 2, 3, 4, label=self._label('non_voc_label'), + id=2, group=2, attributes={ 'truncated': False, 'difficult': False, 'occluded': False, @@ -663,15 +662,15 @@ def categories(self): class DstExtractor(TestExtractorBase): def __iter__(self): yield DatasetItem(id=1, annotations=[ - Bbox(2, 3, 4, 5, label=self._label('label_1'), id=1, - attributes={ + Bbox(2, 3, 4, 5, label=self._label('label_1'), + id=1, group=1, attributes={ 'truncated': False, 'difficult': False, 'occluded': False, } ), - Bbox(1, 2, 3, 4, label=self._label('label_2'), id=2, - attributes={ + Bbox(1, 2, 3, 4, label=self._label('label_2'), + id=2, group=2, attributes={ 'truncated': False, 'difficult': False, 'occluded': False,