add config file and readme for TPS

open-mmlab · Apr 23, 2021 · ca79c92 · ca79c92
1 parent 678b369
commit ca79c92
Show file tree

Hide file tree

Showing 2 changed files with 169 additions and 0 deletions.
diff --git a/configs/textrecog/tps/README.md b/configs/textrecog/tps/README.md
@@ -0,0 +1,26 @@
+# Thin-Plate-Spline (TPS) transformation
+
+## Introduction
+
+[ALGORITHM]
+
+```bibtex
+@article{shi2016robust,
+  title={Robust Scene Text Recognition with Automatic Rectification},
+  author={Shi, Baoguang and Wang, Xinggang and Lyu, Pengyuan and Yao,
+  Cong and Bai, Xiang},
+  year={2016}
+}
+```
+
+## About using TPS in other models
+
+- Simply change `cfg.model.preprocessor` from `None` to
+```python
+dict(
+  type='TPSPreprocessor',
+  num_fiducial=20,
+  img_size=(32, 100),
+  rectified_img_size=(32, 100),
+  num_img_channel=1
+)
diff --git a/configs/textrecog/tps/crnn_tps_academic_dataset.py b/configs/textrecog/tps/crnn_tps_academic_dataset.py
@@ -0,0 +1,143 @@
+_base_ = []
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=1,
+    hooks=[
+        dict(type='TextLoggerHook')
+
+    ])
+# yapf:enable
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
+
+# model
+label_convertor = dict(
+    type='CTCConvertor', dict_type='DICT36', with_unknown=False, lower=True)
+
+model = dict(
+    type='CRNNNet',
+    preprocessor=dict(
+        type='TPSPreprocessor',
+        num_fiducial=20,
+        img_size=(32, 100),
+        rectified_img_size=(32, 100),
+        num_img_channel=1),
+    backbone=dict(type='VeryDeepVgg', leakyRelu=False, input_channels=1),
+    encoder=None,
+    decoder=dict(type='CRNNDecoder', in_channels=512, rnn_flag=True),
+    loss=dict(type='CTCLoss'),
+    label_convertor=label_convertor,
+    pretrained=None)
+
+train_cfg = None
+test_cfg = None
+
+# optimizer
+optimizer = dict(type='Adadelta', lr=1.0)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(policy='step', step=[])
+total_epochs = 5
+
+# data
+img_norm_cfg = dict(mean=[0.5], std=[0.5])
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', color_type='grayscale'),
+    dict(
+        type='ResizeOCR',
+        height=32,
+        min_width=100,
+        max_width=100,
+        keep_aspect_ratio=False),
+    dict(type='ToTensorOCR'),
+    dict(type='NormalizeOCR', **img_norm_cfg),
+    dict(
+        type='Collect',
+        keys=['img'],
+        meta_keys=[
+            'filename', 'ori_shape', 'img_shape', 'text', 'valid_ratio'
+        ]),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', color_type='grayscale'),
+    dict(
+        type='ResizeOCR',
+        height=32,
+        min_width=4,
+        max_width=None,
+        keep_aspect_ratio=True),
+    dict(type='ToTensorOCR'),
+    dict(type='NormalizeOCR', **img_norm_cfg),
+    dict(
+        type='Collect',
+        keys=['img'],
+        meta_keys=['filename', 'ori_shape', 'img_shape', 'valid_ratio']),
+]
+
+dataset_type = 'OCRDataset'
+
+train_img_prefix = 'data/mixture/Syn90k/mnt/ramdisk/max/90kDICT32px'
+train_ann_file = 'data/mixture/Syn90k/label.lmdb'
+
+train1 = dict(
+    type=dataset_type,
+    img_prefix=train_img_prefix,
+    ann_file=train_ann_file,
+    loader=dict(
+        type='LmdbLoader',
+        repeat=1,
+        parser=dict(
+            type='LineStrParser',
+            keys=['filename', 'text'],
+            keys_idx=[0, 1],
+            separator=' ')),
+    pipeline=train_pipeline,
+    test_mode=False)
+
+test_prefix = 'data/mixture/'
+test_img_prefix1 = test_prefix + 'icdar_2013/'
+test_img_prefix2 = test_prefix + 'IIIT5K/'
+test_img_prefix3 = test_prefix + 'svt/'
+
+test_ann_file1 = test_prefix + 'icdar_2013/test_label_1015.txt'
+test_ann_file2 = test_prefix + 'IIIT5K/test_label.txt'
+test_ann_file3 = test_prefix + 'svt/test_label.txt'
+
+test1 = dict(
+    type=dataset_type,
+    img_prefix=test_img_prefix1,
+    ann_file=test_ann_file1,
+    loader=dict(
+        type='HardDiskLoader',
+        repeat=1,
+        parser=dict(
+            type='LineStrParser',
+            keys=['filename', 'text'],
+            keys_idx=[0, 1],
+            separator=' ')),
+    pipeline=test_pipeline,
+    test_mode=True)
+
+test2 = {key: value for key, value in test1.items()}
+test2['img_prefix'] = test_img_prefix2
+test2['ann_file'] = test_ann_file2
+
+test3 = {key: value for key, value in test1.items()}
+test3['img_prefix'] = test_img_prefix3
+test3['ann_file'] = test_ann_file3
+
+data = dict(
+    samples_per_gpu=64,
+    workers_per_gpu=4,
+    train=dict(type='ConcatDataset', datasets=[train1]),
+    val=dict(type='ConcatDataset', datasets=[test1, test2, test3]),
+    test=dict(type='ConcatDataset', datasets=[test1, test2, test3]))
+
+evaluation = dict(interval=1, metric='acc')
+
+cudnn_benchmark = True