Support for LFW dataset format (cvat-ai#110)

* add support for LFW dataset format
* update documentation
* update Changelog

Co-authored-by: Maxim Zhiltsov <maxim.zhiltsov@intel.com>
TOsmanov · Feb 26, 2021 · dad5c05 · dad5c05
1 parent 1325eef
commit dad5c05
Show file tree

Hide file tree

Showing 10 changed files with 295 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `Icdar13/15` dataset format (<https://github.com/openvinotoolkit/datumaro/pull/96>)
 - Laziness, source caching, tracking of changes and partial updating for `Dataset` (<https://github.com/openvinotoolkit/datumaro/pull/102>)
 - `Market-1501` dataset format (<https://github.com/openvinotoolkit/datumaro/pull/108>)
+- `LFW` dataset format (<https://github.com/openvinotoolkit/datumaro/pull/110>)
 
 ### Changed
 - OpenVINO model launcher is updated for OpenVINO r2021.1 (<https://github.com/openvinotoolkit/datumaro/pull/100>)

diff --git a/README.md b/README.md
@@ -137,6 +137,7 @@ CVAT annotations                             ---> Publication, statistics etc.
   - [LabelMe](http://labelme.csail.mit.edu/Release3.0)
   - [ICDAR13/15](https://rrc.cvc.uab.es/?ch=2) (`word_recognition`, `text_localization`, `text_segmentation`)
   - [Market-1501](https://www.aitribune.com/dataset/2018051063) (`person re-identification`)
+  - [LFW](http://vis-www.cs.umass.edu/lfw/) (`person re-identification`, `landmarks`)
 - Dataset building
   - Merging multiple datasets into one
   - Dataset filtering by a custom criteria:

diff --git a/datumaro/plugins/lfw_format.py b/datumaro/plugins/lfw_format.py
@@ -0,0 +1,135 @@
+# Copyright (C) 2020 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+import os
+import os.path as osp
+import re
+
+from datumaro.components.converter import Converter
+from datumaro.components.extractor import (AnnotationType, DatasetItem,
+    Importer, Points, SourceExtractor)
+
+
+class LfwPath:
+    """Names and patterns describing how an LFW subset is laid out on disk."""
+
+    # Matches "<person>_<number>": group 1 is the person name,
+    # group 2 is the image number.
+    PATTERN = re.compile(r'([\w]+)_([-\d]+)')
+
+    # Extension appended to an item id to build its image file name
+    IMAGE_EXT = '.jpg'
+
+    # Annotation file names
+    PAIRS_FILE = 'pairs.txt'
+    LANDMARKS_FILE = 'landmarks.txt'
+
+    # Name of the directory that holds the images
+    IMAGES_DIR = 'images'
+
+class LfwExtractor(SourceExtractor):
+    """Reads one subset of a dataset stored in the LFW format.
+
+    The extractor is given the path to a pairs file. The name of the
+    directory holding that file becomes the subset name, and the parent of
+    that directory is treated as the dataset root. Images are referenced as
+    ``<root>/<subset>/images/<item id>.jpg``; landmarks are read from
+    ``<root>/<subset>/landmarks.txt`` when that file exists.
+    """
+
+    def __init__(self, path):
+        """Load every item described by the pairs file at ``path``.
+
+        Raises:
+            NotADirectoryError: if ``path`` is not an existing file.
+        """
+        if not osp.isfile(path):
+            # NOTE(review): the exception type does not match the condition
+            # (a missing file, not a missing directory). It is kept as is so
+            # that existing callers are not affected.
+            raise NotADirectoryError("Can't read annotation file '%s'" % path)
+        super().__init__(subset=osp.basename(osp.dirname(path)))
+        self._dataset_dir = osp.dirname(osp.dirname(path))
+        self._items = list(self._load_items(path).values())
+
+    def _load_items(self, path):
+        """Parse the pairs file and, if present, the landmarks file.
+
+        Returns:
+            A dict mapping item id to the DatasetItem built for it.
+        """
+        items = {}
+        images_dir = osp.join(self._dataset_dir, self._subset, LfwPath.IMAGES_DIR)
+
+        with open(path, encoding='utf-8') as f:
+            for line in f:
+                pair = line.strip().split()
+                if len(pair) == 3:
+                    # "<person> <num1> <num2>": both images show one person
+                    image1 = self.get_image_name(pair[0], pair[1])
+                    image2 = self.get_image_name(pair[0], pair[2])
+                    pair_kind = 'positive_pairs'
+                elif len(pair) == 4:
+                    # "<person1> <num1> <person2> <num2>": different persons
+                    image1 = self.get_image_name(pair[0], pair[1])
+                    image2 = self.get_image_name(pair[2], pair[3])
+                    pair_kind = 'negative_pairs'
+                else:
+                    # Lines with any other number of fields describe no pair
+                    continue
+
+                # Both images become items; the pair itself is recorded
+                # only on the first image of the pair.
+                first = self._get_or_add_item(items, image1,
+                    osp.join(images_dir, image1 + LfwPath.IMAGE_EXT))
+                self._get_or_add_item(items, image2,
+                    osp.join(images_dir, image2 + LfwPath.IMAGE_EXT))
+                first.attributes[pair_kind].append(image2)
+
+        landmarks_file = osp.join(self._dataset_dir, self._subset,
+            LfwPath.LANDMARKS_FILE)
+        if osp.isfile(landmarks_file):
+            with open(landmarks_file, encoding='utf-8') as f:
+                for line in f:
+                    # Strip the line ending first: otherwise a blank line
+                    # would produce an item with the id '\n', and a trailing
+                    # tab would produce an empty coordinate.
+                    fields = line.strip().split('\t')
+
+                    image_name = fields[0]
+                    if not image_name:
+                        continue
+
+                    item_id = image_name
+                    if item_id.endswith(LfwPath.IMAGE_EXT):
+                        item_id = item_id[:-len(LfwPath.IMAGE_EXT)]
+
+                    item = self._get_or_add_item(items, item_id,
+                        osp.join(images_dir, image_name))
+                    item.annotations.append(
+                        Points([float(p) for p in fields[1:]]))
+        return items
+
+    def _get_or_add_item(self, items, item_id, image_path):
+        """Return ``items[item_id]``, creating an item with empty pair lists
+        and the given image path if it does not exist yet."""
+        item = items.get(item_id)
+        if item is None:
+            item = DatasetItem(id=item_id, subset=self._subset,
+                image=image_path,
+                attributes={'positive_pairs': [], 'negative_pairs': []})
+            items[item_id] = item
+        return item
+
+    @staticmethod
+    def get_image_name(person, image_id):
+        """Build an item id of the form ``<person>/<person>_<NNNN>``, where
+        the image number is zero-padded to four digits.
+
+        Raises:
+            ValueError: if ``image_id`` cannot be converted to an integer.
+        """
+        return '{}/{}_{:04d}'.format(person, person, int(image_id))
+
+class LfwImporter(Importer):
+    """Finds LFW sources by looking for pairs files."""
+
+    @classmethod
+    def find_sources(cls, path):
+        """Return whatever ``_find_sources_recursive`` reports for ``path``
+        when asked for files named like the LFW pairs file, tagged 'lfw'."""
+        pairs_file_name = LfwPath.PAIRS_FILE
+        found = cls._find_sources_recursive(path, pairs_file_name, 'lfw')
+        return found
+
+class LfwConverter(Converter):
+    """Writes a dataset in the LFW format.
+
+    For every subset a pairs file is written. A landmarks file is written
+    only when the subset contains point annotations, and images are saved
+    only when image saving is enabled.
+    """
+
+    DEFAULT_IMAGE_EXT = '.jpg'
+
+    def apply(self):
+        """Export every subset of the source dataset to the save directory."""
+        for subset_name, subset in self._extractor.subsets().items():
+            positive_pairs = []
+            negative_pairs = []
+            landmarks = []
+            for item in subset:
+                if item.has_image and self._save_images:
+                    self._save_image(item, osp.join(self._save_dir, subset_name,
+                        LfwPath.IMAGES_DIR, item.id + LfwPath.IMAGE_EXT))
+
+                item_positive, item_negative = self._collect_pairs(item)
+                positive_pairs.extend(item_positive)
+                negative_pairs.extend(item_negative)
+
+                for landmark in item.annotations:
+                    if landmark.type != AnnotationType.points:
+                        continue
+                    landmarks.append('%s\t%s' % (item.id + LfwPath.IMAGE_EXT,
+                        '\t'.join(str(p) for p in landmark.points)))
+
+            pairs_file = osp.join(self._save_dir, subset_name, LfwPath.PAIRS_FILE)
+            os.makedirs(osp.dirname(pairs_file), exist_ok=True)
+            with open(pairs_file, 'w', encoding='utf-8') as f:
+                # Positive pairs go first, then negative ones
+                f.writelines('%s\n' % pair for pair in positive_pairs)
+                f.writelines('%s\n' % pair for pair in negative_pairs)
+
+            if landmarks:
+                landmarks_file = osp.join(self._save_dir, subset_name,
+                    LfwPath.LANDMARKS_FILE)
+                with open(landmarks_file, 'w', encoding='utf-8') as f:
+                    f.writelines('%s\n' % landmark for landmark in landmarks)
+
+    def _collect_pairs(self, item):
+        """Build the pairs-file lines for one item.
+
+        The item id is parsed only when the item actually has pairs, so
+        items whose id does not follow the '<person>_<number>' pattern can
+        still be exported (without pairs) instead of aborting the export.
+
+        Returns:
+            A tuple (positive_lines, negative_lines) of tab-separated lines
+            without line endings.
+        """
+        # Imported locally to keep the module-level imports untouched
+        import logging as log
+
+        positive = item.attributes.get('positive_pairs') or []
+        negative = item.attributes.get('negative_pairs') or []
+        if not positive and not negative:
+            return [], []
+
+        parsed_id = self._parse_image_name(item.id)
+        if parsed_id is None:
+            log.warning("Item '%s': pairs are skipped, because the id does "
+                "not match the '<person>_<number>' pattern", item.id)
+            return [], []
+        person1, num1 = parsed_id
+
+        positive_lines = []
+        for pair in positive:
+            parsed_pair = self._parse_image_name(pair)
+            if parsed_pair is None:
+                log.warning("Item '%s': positive pair '%s' is skipped, "
+                    "because its name can't be parsed", item.id, pair)
+                continue
+            # Only the image number is written: the positive line format
+            # has a single person field, taken from the item itself.
+            positive_lines.append('%s\t%s\t%s' % (person1, num1, parsed_pair[1]))
+
+        negative_lines = []
+        for pair in negative:
+            parsed_pair = self._parse_image_name(pair)
+            if parsed_pair is None:
+                log.warning("Item '%s': negative pair '%s' is skipped, "
+                    "because its name can't be parsed", item.id, pair)
+                continue
+            person2, num2 = parsed_pair
+            negative_lines.append('%s\t%s\t%s\t%s' %
+                (person1, num1, person2, num2))
+
+        return positive_lines, negative_lines
+
+    @staticmethod
+    def _parse_image_name(name):
+        """Split an image name into (person, number).
+
+        Returns None when the name does not contain '<person>_<number>' or
+        the number part is not a valid integer.
+        """
+        match = LfwPath.PATTERN.search(name)
+        if match is None:
+            return None
+        person, number = match.groups()
+        try:
+            # The pattern also accepts '-' in the number part, so the
+            # captured text is not guaranteed to be an integer.
+            return person, int(number)
+        except ValueError:
+            return None
diff --git a/docs/user_manual.md b/docs/user_manual.md
@@ -126,6 +126,9 @@ List of supported formats:
 - Market-1501 (`person re-identification`)
   - [Format specification](https://www.aitribune.com/dataset/2018051063)
   - [Dataset example](../tests/assets/market1501_dataset)
+- LFW (`person re-identification`, `landmarks`)
+  - [Format specification](http://vis-www.cs.umass.edu/lfw/)
+  - [Dataset example](../tests/assets/lfw_dataset)
 
 List of supported annotation types:
 - Labels

diff --git a/tests/assets/lfw_dataset/test/images/name0/name0_0001.jpg b/tests/assets/lfw_dataset/test/images/name0/name0_0001.jpg
diff --git a/tests/assets/lfw_dataset/test/images/name1/name1_0001.jpg b/tests/assets/lfw_dataset/test/images/name1/name1_0001.jpg
diff --git a/tests/assets/lfw_dataset/test/images/name1/name1_0002.jpg b/tests/assets/lfw_dataset/test/images/name1/name1_0002.jpg
diff --git a/tests/assets/lfw_dataset/test/landmarks.txt b/tests/assets/lfw_dataset/test/landmarks.txt
@@ -0,0 +1,3 @@
+name0/name0_0001.jpg	0	4	3	3	2	2	1	0	3	0
+name1/name1_0001.jpg	1	6	4	6	3	3	2	1	4	1
+name1/name1_0002.jpg	0	5	3	5	2	2	1	0	3	0
diff --git a/tests/assets/lfw_dataset/test/pairs.txt b/tests/assets/lfw_dataset/test/pairs.txt
@@ -0,0 +1,5 @@
+1	2
+name1	1 	2
+name0	1	name1	1
+name0	1	name1	2
+
diff --git a/tests/test_lfw_format.py b/tests/test_lfw_format.py
@@ -0,0 +1,147 @@
+import os.path as osp
+from unittest import TestCase
+
+import numpy as np
+from datumaro.components.dataset import Dataset
+from datumaro.components.extractor import DatasetItem, Points
+from datumaro.plugins.lfw_format import LfwConverter, LfwImporter
+from datumaro.util.test_utils import TestDir, compare_datasets
+
+
+class LfwFormatTest(TestCase):
+    """Round-trip tests: export with LfwConverter, then import as 'lfw'."""
+
+    def _check_save_and_load(self, source_dataset):
+        # Export with images into a temporary directory, read the result
+        # back and require it to be equal to the source dataset.
+        with TestDir() as test_dir:
+            LfwConverter.convert(source_dataset, test_dir, save_images=True)
+            parsed_dataset = Dataset.import_from(test_dir, 'lfw')
+
+            compare_datasets(self, source_dataset, parsed_dataset)
+
+    def test_can_save_and_load(self):
+        # Positive and negative pairs spread over two persons
+        first = DatasetItem(id='name0/name0_0001',
+            subset='test', image=np.ones((2, 5, 3)),
+            attributes={
+                'positive_pairs': ['name0/name0_0002'],
+                'negative_pairs': [],
+            })
+        second = DatasetItem(id='name0/name0_0002',
+            subset='test', image=np.ones((2, 5, 3)),
+            attributes={
+                'positive_pairs': [],
+                'negative_pairs': ['name1/name1_0001'],
+            })
+        third = DatasetItem(id='name1/name1_0001',
+            subset='test', image=np.ones((2, 5, 3)),
+            attributes={
+                'positive_pairs': ['name1/name1_0002'],
+                'negative_pairs': [],
+            })
+        fourth = DatasetItem(id='name1/name1_0002',
+            subset='test', image=np.ones((2, 5, 3)),
+            attributes={
+                'positive_pairs': [],
+                'negative_pairs': ['name0/name0_0001'],
+            })
+
+        self._check_save_and_load(
+            Dataset.from_iterable([first, second, third, fourth]))
+
+    def test_can_save_and_load_with_landmarks(self):
+        # Every item carries one Points annotation
+        first = DatasetItem(id='name0/name0_0001',
+            subset='test', image=np.ones((2, 5, 3)),
+            attributes={
+                'positive_pairs': ['name0/name0_0002'],
+                'negative_pairs': [],
+            },
+            annotations=[Points([0, 4, 3, 3, 2, 2, 1, 0, 3, 0])])
+        second = DatasetItem(id='name0/name0_0002',
+            subset='test', image=np.ones((2, 5, 3)),
+            attributes={
+                'positive_pairs': [],
+                'negative_pairs': [],
+            },
+            annotations=[Points([0, 5, 3, 5, 2, 2, 1, 0, 3, 0])])
+
+        self._check_save_and_load(Dataset.from_iterable([first, second]))
+
+    def test_can_save_and_load_with_no_subsets(self):
+        # Items are created without an explicit subset
+        first = DatasetItem(id='name0/name0_0001',
+            image=np.ones((2, 5, 3)),
+            attributes={
+                'positive_pairs': ['name0/name0_0002'],
+                'negative_pairs': [],
+            })
+        second = DatasetItem(id='name0/name0_0002',
+            image=np.ones((2, 5, 3)),
+            attributes={
+                'positive_pairs': [],
+                'negative_pairs': [],
+            })
+
+        self._check_save_and_load(Dataset.from_iterable([first, second]))
+
+# Sample LFW dataset stored next to this test module
+DUMMY_DATASET_DIR = osp.join(
+    osp.dirname(__file__), 'assets', 'lfw_dataset')
+
+class LfwImporterTest(TestCase):
+    """Checks detection and import of the sample dataset in the assets."""
+
+    def test_can_detect(self):
+        detected = LfwImporter.detect(DUMMY_DATASET_DIR)
+        self.assertTrue(detected)
+
+    def test_can_import(self):
+        imported = Dataset.import_from(DUMMY_DATASET_DIR, 'lfw')
+
+        # Mirrors the pairs and landmarks files of the sample dataset
+        expected_items = [
+            DatasetItem(id='name0/name0_0001',
+                subset='test', image=np.ones((2, 5, 3)),
+                attributes={
+                    'positive_pairs': [],
+                    'negative_pairs': [
+                        'name1/name1_0001',
+                        'name1/name1_0002',
+                    ],
+                },
+                annotations=[Points([0, 4, 3, 3, 2, 2, 1, 0, 3, 0])]),
+            DatasetItem(id='name1/name1_0001',
+                subset='test', image=np.ones((2, 5, 3)),
+                attributes={
+                    'positive_pairs': ['name1/name1_0002'],
+                    'negative_pairs': [],
+                },
+                annotations=[Points([1, 6, 4, 6, 3, 3, 2, 1, 4, 1])]),
+            DatasetItem(id='name1/name1_0002',
+                subset='test', image=np.ones((2, 5, 3)),
+                attributes={
+                    'positive_pairs': [],
+                    'negative_pairs': [],
+                },
+                annotations=[Points([0, 5, 3, 5, 2, 2, 1, 0, 3, 0])]),
+        ]
+        expected_dataset = Dataset.from_iterable(expected_items)
+
+        compare_datasets(self, expected_dataset, imported)