open-mmlab · cuhk-hbsun · May 12, 2021 · Apr 28, 2021 · Apr 29, 2021 · May 12, 2021
diff --git a/configs/textrecog/crnn/README.md b/configs/textrecog/crnn/README.md
@@ -28,10 +28,13 @@
 | IIIT5K  |     3000     | regular |
 |   SVT   |     647      | regular |
 |  IC13   |     1015     | regular |
+|  IC15   |     2077     |irregular|
+|  SVTP   |     645      |irregular|
+|  CT80   |     288      |irregular|
 
 ## Results and models
 
 |                         methods                          |        | Regular Text |      |     |      | Irregular Text |      |                                                                                    download                                                                                    |
 | :------------------------------------------------------: | :----: | :----------: | :--: | :-: | :--: | :------------: | :--: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
 |                         methods                          | IIIT5K |     SVT      | IC13 |     | IC15 |      SVTP      | CT80 |
-| [CRNN](/configs/textrecog/crnn/crnn_academic_dataset.py) |  80.5  |     81.5     | 86.5 |     |  -   |       -        |  -   | [model](https://download.openmmlab.com/mmocr/textrecog/crnn/crnn_academic-a723a1c5.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/crnn/20210326_111035.log.json) |
+| [CRNN](/configs/textrecog/crnn/crnn_academic_dataset.py) |  80.5  |     81.5     | 86.5 |     |  54.1   |       59.1        |  55.6   | [model](https://download.openmmlab.com/mmocr/textrecog/crnn/crnn_academic-a723a1c5.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/crnn/20210326_111035.log.json) |
diff --git a/configs/textrecog/crnn/crnn_academic_dataset.py b/configs/textrecog/crnn/crnn_academic_dataset.py
@@ -95,13 +95,20 @@
     test_mode=False)
 
 test_prefix = 'data/mixture/'
-test_img_prefix1 = test_prefix + 'icdar_2013/'
-test_img_prefix2 = test_prefix + 'IIIT5K/'
-test_img_prefix3 = test_prefix + 'svt/'
 
-test_ann_file1 = test_prefix + 'icdar_2013/test_label_1015.txt'
-test_ann_file2 = test_prefix + 'IIIT5K/test_label.txt'
-test_ann_file3 = test_prefix + 'svt/test_label.txt'
+test_img_prefix1 = test_prefix + 'IIIT5K/'
+test_img_prefix2 = test_prefix + 'svt/'
+test_img_prefix3 = test_prefix + 'icdar_2013/'
+test_img_prefix4 = test_prefix + 'icdar_2015/'
+test_img_prefix5 = test_prefix + 'svtp/'
+test_img_prefix6 = test_prefix + 'ct80/'
+
+test_ann_file1 = test_prefix + 'IIIT5K/test_label.txt'
+test_ann_file2 = test_prefix + 'svt/test_label.txt'
+test_ann_file3 = test_prefix + 'icdar_2013/test_label_1015.txt'
+test_ann_file4 = test_prefix + 'icdar_2015/test_label.txt'
+test_ann_file5 = test_prefix + 'svtp/test_label.txt'
+test_ann_file6 = test_prefix + 'ct80/test_label.txt'
 
 test1 = dict(
     type=dataset_type,
@@ -126,12 +133,28 @@
 test3['img_prefix'] = test_img_prefix3
 test3['ann_file'] = test_ann_file3
 
+test4 = {key: value for key, value in test1.items()}
+test4['img_prefix'] = test_img_prefix4
+test4['ann_file'] = test_ann_file4
+
+test5 = {key: value for key, value in test1.items()}
+test5['img_prefix'] = test_img_prefix5
+test5['ann_file'] = test_ann_file5
+
+test6 = {key: value for key, value in test1.items()}
+test6['img_prefix'] = test_img_prefix6
+test6['ann_file'] = test_ann_file6
+
 data = dict(
     samples_per_gpu=64,
     workers_per_gpu=4,
     train=dict(type='ConcatDataset', datasets=[train1]),
-    val=dict(type='ConcatDataset', datasets=[test1, test2, test3]),
-    test=dict(type='ConcatDataset', datasets=[test1, test2, test3]))
+    val=dict(
+        type='ConcatDataset',
+        datasets=[test1, test2, test3, test4, test5, test6]),
+    test=dict(
+        type='ConcatDataset',
+        datasets=[test1, test2, test3, test4, test5, test6]))
 
 evaluation = dict(interval=1, metric='acc')
 

diff --git a/configs/textrecog/tps/README.md b/configs/textrecog/tps/README.md
@@ -1,9 +1,20 @@
-# Thin-Plate-Spline (TPS) transformation
+# CRNN with TPS based STN
 
 ## Introduction
 
 [ALGORITHM]
 
+```bibtex
+@article{shi2016end,
+  title={An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition},
+  author={Shi, Baoguang and Bai, Xiang and Yao, Cong},
+  journal={IEEE transactions on pattern analysis and machine intelligence},
+  year={2016}
+}
+```
+
+[PREPROCESSOR]
+
 ```bibtex
 @article{shi2016robust,
   title={Robust Scene Text Recognition with Automatic Rectification},
@@ -13,14 +24,28 @@
 }
 ```
 
-## About using TPS in other models
-
-- Simply change `cfg.model.preprocessor` from `None` to
-```python
-dict(
-  type='TPSPreprocessor',
-  num_fiducial=20,
-  img_size=(32, 100),
-  rectified_img_size=(32, 100),
-  num_img_channel=1
-)
+## Results and Models
+
+### Train Dataset
+
+| trainset | instance_num | repeat_num | note  |
+| :------: | :----------: | :--------: | :---: |
+|  Syn90k  |   8919273    |     1      | synth |
+
+### Test Dataset
+
+| testset | instance_num |  note   |
+| :-----: | :----------: | :-----: |
+| IIIT5K  |     3000     | regular |
+|   SVT   |     647      | regular |
+|  IC13   |     1015     | regular |
+|  IC15   |     2077     |irregular|
+|  SVTP   |     645      |irregular|
+|  CT80   |     288      |irregular|
+
+## Results and models
+
+|                         methods                          |        | Regular Text |      |     |      | Irregular Text |      |                                                                                    download                                                                                    |
+| :------------------------------------------------------: | :----: | :----------: | :--: | :-: | :--: | :------------: | :--: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|                                                   | IIIT5K |     SVT      | IC13 |     | IC15 |      SVTP      | CT80 |
+| [CRNN-STN](/configs/textrecog/tps/crnn_tps_academic_dataset.py) |  80.8  |     81.3     | 85.0 |     |  59.6   |       68.1        |  53.8   | [model](https://download.openmmlab.com/mmocr/textrecog/tps/crnn_tps_academic_dataset_20210510-d221a905.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/tps/20210510_204353.log.json) |
diff --git a/configs/textrecog/tps/crnn_tps_academic_dataset.py b/configs/textrecog/tps/crnn_tps_academic_dataset.py
@@ -26,7 +26,7 @@
         img_size=(32, 100),
         rectified_img_size=(32, 100),
         num_img_channel=1),
-    backbone=dict(type='VeryDeepVgg', leakyRelu=False, input_channels=1),
+    backbone=dict(type='VeryDeepVgg', leaky_relu=False, input_channels=1),
     encoder=None,
     decoder=dict(type='CRNNDecoder', in_channels=512, rnn_flag=True),
     loss=dict(type='CTCLoss'),
@@ -68,9 +68,9 @@
     dict(
         type='ResizeOCR',
         height=32,
-        min_width=4,
-        max_width=None,
-        keep_aspect_ratio=True),
+        min_width=32,
+        max_width=100,
+        keep_aspect_ratio=False),
     dict(type='ToTensorOCR'),
     dict(type='NormalizeOCR', **img_norm_cfg),
     dict(
@@ -100,13 +100,20 @@
     test_mode=False)
 
 test_prefix = 'data/mixture/'
-test_img_prefix1 = test_prefix + 'icdar_2013/'
-test_img_prefix2 = test_prefix + 'IIIT5K/'
-test_img_prefix3 = test_prefix + 'svt/'
 
-test_ann_file1 = test_prefix + 'icdar_2013/test_label_1015.txt'
-test_ann_file2 = test_prefix + 'IIIT5K/test_label.txt'
-test_ann_file3 = test_prefix + 'svt/test_label.txt'
+test_img_prefix1 = test_prefix + 'IIIT5K/'
+test_img_prefix2 = test_prefix + 'svt/'
+test_img_prefix3 = test_prefix + 'icdar_2013/'
+test_img_prefix4 = test_prefix + 'icdar_2015/'
+test_img_prefix5 = test_prefix + 'svtp/'
+test_img_prefix6 = test_prefix + 'ct80/'
+
+test_ann_file1 = test_prefix + 'IIIT5K/test_label.txt'
+test_ann_file2 = test_prefix + 'svt/test_label.txt'
+test_ann_file3 = test_prefix + 'icdar_2013/test_label_1015.txt'
+test_ann_file4 = test_prefix + 'icdar_2015/test_label.txt'
+test_ann_file5 = test_prefix + 'svtp/test_label.txt'
+test_ann_file6 = test_prefix + 'ct80/test_label.txt'
 
 test1 = dict(
     type=dataset_type,
@@ -131,12 +138,28 @@
 test3['img_prefix'] = test_img_prefix3
 test3['ann_file'] = test_ann_file3
 
+test4 = {key: value for key, value in test1.items()}
+test4['img_prefix'] = test_img_prefix4
+test4['ann_file'] = test_ann_file4
+
+test5 = {key: value for key, value in test1.items()}
+test5['img_prefix'] = test_img_prefix5
+test5['ann_file'] = test_ann_file5
+
+test6 = {key: value for key, value in test1.items()}
+test6['img_prefix'] = test_img_prefix6
+test6['ann_file'] = test_ann_file6
+
 data = dict(
     samples_per_gpu=64,
     workers_per_gpu=4,
     train=dict(type='ConcatDataset', datasets=[train1]),
-    val=dict(type='ConcatDataset', datasets=[test1, test2, test3]),
-    test=dict(type='ConcatDataset', datasets=[test1, test2, test3]))
+    val=dict(
+        type='ConcatDataset',
+        datasets=[test1, test2, test3, test4, test5, test6]),
+    test=dict(
+        type='ConcatDataset',
+        datasets=[test1, test2, test3, test4, test5, test6]))
 
 evaluation = dict(interval=1, metric='acc')
 

diff --git a/mmocr/models/textrecog/preprocessor/tps_preprocessor.py b/mmocr/models/textrecog/preprocessor/tps_preprocessor.py
@@ -23,26 +23,35 @@
 
 @PREPROCESSOR.register_module()
 class TPSPreprocessor(BasePreprocessor):
-    """Rectification Network of RARE, namely TPS based STN."""
+    """Rectification Network of RARE, namely TPS based STN in.
+
+    <https://arxiv.org/pdf/1603.03915.pdf>`_.
+
+    Args:
+        num_fiducial (int): Number of fiducial points of TPS-STN.
+        img_size (tuple(int, int)): Size (height, width) of the input image.
+        rectified_img_size (tuple(int, int))::
+            Size (height, width) of the rectified image.
+        num_img_channel (int): Number of channels of the input image.
+
+    Output:
+        batch_rectified_img: Rectified image with size
+            [batch_size x num_img_channel x rectified_img_height
+            x rectified_img_width]
+    """
 
     def __init__(self,
-                 num_fiducial,
-                 img_size,
-                 rectified_img_size,
+                 num_fiducial=20,
+                 img_size=(32, 100),
+                 rectified_img_size=(32, 100),
                  num_img_channel=1):
-        """ Based on RARE TPS
-        Args:
-            num_fiducial (int): number of fiducial points of TPS-STN
-            img_size (int, int): (height, width) of the input image
-            rectified_img_size (int, int):
-                (height, width) of the rectified image
-            num_img_channel (int): the number of channels of the input image
-        output:
-            batch_rectified_img: rectified image
-                [batch_size x num_img_channel x rectified_img_height
-                x rectified_img_width]
-        """
         super().__init__()
+        assert isinstance(num_fiducial, int)
+        assert num_fiducial > 0
+        assert isinstance(img_size, tuple)
+        assert isinstance(rectified_img_size, tuple)
+        assert isinstance(num_img_channel, int)
+
         self.num_fiducial = num_fiducial
         self.img_size = img_size
         self.rectified_img_size = rectified_img_size
@@ -71,13 +80,15 @@ def forward(self, batch_img):
 
         return batch_rectified_img
 
-    def init_weights(self):
-        pass
-
 
 class LocalizationNetwork(nn.Module):
     """Localization Network of RARE, which predicts C' (K x 2) from input
-    (img_width x img_height)"""
+    (img_width x img_height)
+
+    Args:
+        num_fiducial (int): Number of fiducial points of TPS-STN.
+        num_img_channel (int): Number of channels of the input image.
+    """
 
     def __init__(self, num_fiducial, num_img_channel):
         super().__init__()
@@ -128,7 +139,8 @@ def forward(self, batch_img):
         Args:
             batch_img (tensor): Batch Input Image
                 [batch_size x num_img_channel x img_height x img_width]
-        output:
+
+        Output:
             batch_C_prime : Predicted coordinates of fiducial points for
             input batch [batch_size x num_fiducial x 2]
         """
@@ -141,8 +153,13 @@ def forward(self, batch_img):
 
 
 class GridGenerator(nn.Module):
-    """Grid Generator of RARE, which produces P_prime by multipling T with
-    P."""
+    """Grid Generator of RARE, which produces P_prime by multipling T with P.
+
+    Args:
+        num_fiducial (int): Number of fiducial points of TPS-STN.
+        rectified_img_size (tuple(int, int)):
+            Size (height, width) of the rectified image.
+    """
 
     def __init__(self, num_fiducial, rectified_img_size):
         """Generate P_hat and inv_delta_C for later."""

diff --git a/tests/test_models/test_ocr_preprocessor.py b/tests/test_models/test_ocr_preprocessor.py
@@ -1,10 +1,20 @@
+import pytest
 import torch
 
 from mmocr.models.textrecog.preprocessor import (BasePreprocessor,
                                                  TPSPreprocessor)
 
 
 def test_tps_preprocessor():
+    with pytest.raises(AssertionError):
+        TPSPreprocessor(num_fiducial=-1)
+    with pytest.raises(AssertionError):
+        TPSPreprocessor(img_size=32)
+    with pytest.raises(AssertionError):
+        TPSPreprocessor(rectified_img_size=100)
+    with pytest.raises(AssertionError):
+        TPSPreprocessor(num_img_channel='bgr')
+
     tps_preprocessor = TPSPreprocessor(
         num_fiducial=20,
         img_size=(32, 100),