[Fix] Fix two bugs in dataset preparer (#1513)

* fix two bugs * simplify code
open-mmlab · Nov 7, 2022 · 0afbb70 · 0afbb70
1 parent abf5a89
commit 0afbb70
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 9 deletions.
diff --git a/mmocr/datasets/preparers/data_converter.py b/mmocr/datasets/preparers/data_converter.py
@@ -143,14 +143,14 @@ def pack_instance(self, sample: Tuple, split: str) -> Dict:
         """
 
     @abstractmethod
-    def add_meta(self, sample: Dict) -> Dict:
+    def add_meta(self, sample: List) -> Dict:
         """Add meta information to the sample.
 
         Args:
-            sample (Dict): A sample of the dataset.
+            sample (List): A list of samples of the dataset.
 
         Returns:
-            Dict: A sample with meta information.
+            Dict: A dict containing the meta information and samples.
         """
 
     def mono_gather(self, ann_path: str, mapping: str, split: str,
@@ -297,7 +297,7 @@ def pack_instance(self,
 
         return packed_instances
 
-    def add_meta(self, sample: Dict) -> Dict:
+    def add_meta(self, sample: List) -> Dict:
         meta = {
             'metainfo': {
                 'dataset_type': 'TextDetDataset',
@@ -396,7 +396,7 @@ def pack_instance(self,
 
         return packed_instances
 
-    def add_meta(self, sample: Dict) -> Dict:
+    def add_meta(self, sample: List) -> Dict:
         meta = {
             'metainfo': {
                 'dataset_type': 'TextSpottingDataset',
@@ -465,7 +465,7 @@ def pack_instance(self, sample: Tuple, split: str) -> Dict:
 
         return packed_instance
 
-    def add_meta(self, sample: Dict) -> Dict:
+    def add_meta(self, sample: List) -> Dict:
         meta = {
             'metainfo': {
                 'dataset_type': 'TextRecogDataset',
@@ -522,7 +522,6 @@ def __init__(self,
             dataset_name=dataset_name,
             nproc=nproc,
             delete=delete)
-        self.ignore = self.parser.ignore
         self.lepr = long_edge_pad_ratio
         self.sepr = short_edge_pad_ratio
         # Crop converter crops the images of textdet to patches
@@ -555,7 +554,7 @@ def get_box(instance: Dict) -> List:
         img = mmcv.imread(img_path)
         for i, instance in enumerate(instances):
             box, text = get_box(instance), instance['text']
-            if text == self.ignore:
+            if instance['ignore']:
                 continue
             patch = crop_img(img, box, self.lepr, self.sepr)
             if patch.shape[0] == 0 or patch.shape[1] == 0:
@@ -572,6 +571,12 @@ def get_box(instance: Dict) -> List:
 
         return data_list
 
+    def add_meta(self, sample: List) -> Dict:
+        # Since the TextRecogCropConverter packs all of the patches in a single
+        # image into a list, we need to flatten the list.
+        sample = [item for sublist in sample for item in sublist]
+        return super().add_meta(sample)
+
 
 @DATA_CONVERTERS.register_module()
 class WildReceiptConverter(BaseDataConverter):

diff --git a/mmocr/datasets/preparers/dumpers/dumpers.py b/mmocr/datasets/preparers/dumpers/dumpers.py
@@ -40,7 +40,7 @@ def dump(self, data: Dict, data_root: str, split: str) -> str:
         dst_file = osp.join(data_root, f'{self.task}_{split}.json')
         mmengine.dump(data, dst_file)
 
-        cfg = f'\n{self.dataset_name}_{self.task}_{split} = dict (\n'
+        cfg = f'\n{self.dataset_name}_{self.task}_{split} = dict(\n'
         cfg += '    type=\'OCRDataset\',\n'
         cfg += '    data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n'  # noqa: E501
         cfg += f'    ann_file=\'{osp.basename(dst_file)}\',\n'