add kie

bupt906 · Oct 9, 2021 · de88d73 · de88d73
1 parent c5f33b0
commit de88d73
Show file tree

Hide file tree

Showing 21 changed files with 1,035 additions and 25 deletions.
diff --git a/configs/e2e/e2e_r50_vd_pg.yml b/configs/e2e/e2e_r50_vd_pg.yml
@@ -69,7 +69,7 @@ Metric:
 Train:
   dataset:
     name: PGDataSet
-    label_file_list: [.././train_data/total_text/train/]
+    label_file_list: [.././train_data/total_text/train/total_text.txt]
     ratio_list: [1.0]
     data_format: icdar #two data format: icdar/textnet
     transforms:
@@ -93,7 +93,7 @@ Eval:
   dataset:
     name: PGDataSet
     data_dir: ./train_data/
-    label_file_list: [./train_data/total_text/test/]
+    label_file_list: [./train_data/total_text/test/total_text.txt]
     transforms:
       - DecodeImage: # load image
           img_mode: RGB

diff --git a/configs/kie/kie_unet_sdmgr.yml b/configs/kie/kie_unet_sdmgr.yml
@@ -0,0 +1,111 @@
+Global:
+  use_gpu: True
+  epoch_num: 300
+  log_smooth_window: 20
+  print_batch_step: 50
+  save_model_dir: ./output/kie_5/
+  save_epoch_step: 50
+  # evaluation is run every 80 iterations after the 0th iteration
+  eval_batch_step: [ 0, 80 ]
+  # 1. If pretrained_model is saved in static mode, such as classification pretrained model
+  #    from static branch, load_static_weights must be set as True.
+  # 2. If you want to finetune the pretrained models we provide in the docs,
+  #    you should set load_static_weights as False.
+  load_static_weights: False
+  cal_metric_during_train: False
+  pretrained_model: ./output/kie_4/best_accuracy
+  checkpoints:
+  save_inference_dir:
+  use_visualdl: False
+  class_path: ./train_data/wildreceipt/class_list.txt
+  infer_img: ./train_data/wildreceipt/1.txt
+  save_res_path: ./output/sdmgr_kie/predicts_kie.txt
+  img_scale: [ 1024, 512 ]
+
+Architecture:
+  model_type: kie
+  algorithm: SDMGR
+  Transform:
+  Backbone:
+    name: Kie_backbone
+  Head:
+    name: SDMGRHead
+
+Loss:
+  name: SDMGRLoss
+
+Optimizer:
+  name: Adam
+  beta1: 0.9
+  beta2: 0.999
+  lr:
+    name: Piecewise
+    learning_rate: 0.001
+    decay_epochs: [ 60, 80, 100]
+    values: [ 0.001, 0.0001, 0.00001]
+    warmup_epoch: 2
+  regularizer:
+    name: 'L2'
+    factor: 0.00005
+
+PostProcess:
+  name: None
+
+Metric:
+  name: KIEMetric
+  main_indicator: hmean
+
+Train:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data/wildreceipt/
+    label_file_list: [ './train_data/wildreceipt/wildreceipt_train.txt' ]
+    ratio_list: [ 1.0 ]
+    transforms:
+      - DecodeImage: # load image
+          img_mode: RGB
+          channel_first: False
+      - NormalizeImage:
+          scale: 1
+          mean: [ 123.675, 116.28, 103.53 ]
+          std: [ 58.395, 57.12, 57.375 ]
+          order: 'hwc'
+      - KieLabelEncode: # Class handling label
+          character_dict_path: ./train_data/wildreceipt/dict.txt
+      - KieResize:
+      - ToCHWImage:
+      - KeepKeys:
+          keep_keys: [ 'image', 'relations', 'texts', 'points', 'labels', 'tag', 'shape'] # dataloader will return list in this order
+  loader:
+    shuffle: True
+    drop_last: False
+    batch_size_per_card: 4
+    num_workers: 4
+
+Eval:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data/wildreceipt
+    label_file_list:
+      - ./train_data/wildreceipt/wildreceipt_test.txt
+      # - /paddle/data/PaddleOCR/train_data/wildreceipt/1.txt
+    transforms:
+      - DecodeImage: # load image
+          img_mode: RGB
+          channel_first: False
+      - KieLabelEncode: # Class handling label
+          character_dict_path: ./train_data/wildreceipt/dict.txt
+      - KieResize:
+      - NormalizeImage:
+          scale: 1
+          mean: [ 123.675, 116.28, 103.53 ]
+          std: [ 58.395, 57.12, 57.375 ]
+          order: 'hwc'
+      - ToCHWImage:
+      - KeepKeys:
+          keep_keys: [ 'image', 'relations', 'texts', 'points', 'labels', 'tag', 'ori_image', 'ori_boxes', 'shape']
+  loader:
+    shuffle: False
+    drop_last: False
+    batch_size_per_card: 1 # must be 1
+    num_workers: 4
diff --git a/doc/doc_ch/pgnet.md b/doc/doc_ch/pgnet.md
@@ -87,15 +87,15 @@ python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/im
 ```
 /PaddleOCR/train_data/total_text/train/
   |- rgb/            # total_text数据集的训练数据
-      |- gt_0.png
+      |- img11.jpg
       | ...  
-  |- total_text.txt  # total_text数据集的训练标注
+  |- train.txt       # total_text数据集的训练标注
 ```
 
 total_text.txt标注文件格式如下，文件名和标注信息中间用"\t"分隔：
 ```
 " 图像文件名                    json.dumps编码的图像标注信息"
-rgb/gt_0.png    [{"transcription": "EST", "points": [[1004.0,689.0],[1019.0,698.0],[1034.0,708.0],[1049.0,718.0],[1064.0,728.0],[1079.0,738.0],[1095.0,748.0],[1094.0,774.0],[1079.0,765.0],[1065.0,756.0],[1050.0,747.0],[1036.0,738.0],[1021.0,729.0],[1007.0,721.0]]}, {...}]
+rgb/img11.jpg    [{"transcription": "ASRAMA", "points": [[214.0, 325.0], [235.0, 308.0], [259.0, 296.0], [286.0, 291.0], [313.0, 295.0], [338.0, 305.0], [362.0, 320.0], [349.0, 347.0], [330.0, 337.0], [310.0, 329.0], [290.0, 324.0], [269.0, 328.0], [249.0, 336.0], [231.0, 346.0]]}, {...}]
 ```
 json.dumps编码前的图像标注信息是包含多个字典的list，字典中的 `points` 表示文本框的四个点的坐标(x, y)，从左上角的点开始顺时针排列。
 `transcription` 表示当前文本框的文字，**当其内容为“###”时，表示该文本框无效，在训练时会跳过。**

diff --git a/doc/doc_en/pgnet_en.md b/doc/doc_en/pgnet_en.md
@@ -80,15 +80,15 @@ Download and unzip [totaltext](https://github.com/cs-chan/Total-Text-Dataset/blo
 ```
 /PaddleOCR/train_data/total_text/train/
   |- rgb/            # total_text training data of dataset
-      |- gt_0.png
+      |- img11.jpg
       | ...  
-  |- total_text.txt  # total_text training annotation of dataset
+  |- train.txt       # total_text training annotation of dataset
 ```
 
 total_text.txt: the format of the annotation file is as follows; the file name and the annotation information are separated by "\t":
 ```
 " Image file name             Image annotation information encoded by json.dumps"
-rgb/gt_0.png    [{"transcription": "EST", "points": [[1004.0,689.0],[1019.0,698.0],[1034.0,708.0],[1049.0,718.0],[1064.0,728.0],[1079.0,738.0],[1095.0,748.0],[1094.0,774.0],[1079.0,765.0],[1065.0,756.0],[1050.0,747.0],[1036.0,738.0],[1021.0,729.0],[1007.0,721.0]]}, {...}]
+rgb/img11.jpg    [{"transcription": "ASRAMA", "points": [[214.0, 325.0], [235.0, 308.0], [259.0, 296.0], [286.0, 291.0], [313.0, 295.0], [338.0, 305.0], [362.0, 320.0], [349.0, 347.0], [330.0, 337.0], [310.0, 329.0], [290.0, 324.0], [269.0, 328.0], [249.0, 336.0], [231.0, 346.0]]}, {...}]
 ```
 The image annotation before **json.dumps()** encoding is a list containing multiple dictionaries.
 

diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py
@@ -19,6 +19,7 @@
 
 import numpy as np
 import string
+from shapely.geometry import LineString, Point, Polygon
 
 
 class ClsLabelEncode(object):
@@ -200,19 +201,183 @@ def __init__(self,
         self.pad_num = len(self.dict)  # the length to pad
 
     def __call__(self, data):
+        text_label_index_list, temp_text = [], []
         texts = data['strs']
-        temp_texts = []
         for text in texts:
             text = text.lower()
-            text = self.encode(text)
-            if text is None:
-                return None
-            text = text + [self.pad_num] * (self.max_text_len - len(text))
-            temp_texts.append(text)
-        data['strs'] = np.array(temp_texts)
+            temp_text = []
+            for c_ in text:
+                if c_ in self.dict:
+                    temp_text.append(self.dict[c_])
+            temp_text = temp_text + [self.pad_num] * (self.max_text_len -
+                                                      len(temp_text))
+            text_label_index_list.append(temp_text)
+        data['strs'] = np.array(text_label_index_list)
         return data
 
 
+class KieLabelEncode(object):
+    """Encode key-information-extraction annotations into fixed-size arrays.
+
+    Each sample's JSON label is turned into padded box, relation, text and
+    label arrays whose shapes do not depend on the number of boxes.
+
+    Args:
+        character_dict_path: Path to a text file with one character per
+            line. Characters are numbered from 1 in file order; index 0 is
+            reserved for the empty string.
+        norm: Divisor applied to the pairwise x/y offsets between boxes.
+        directed: If True, an edge in row ``i`` is kept only when the label
+            of box ``i`` equals 1.
+        **kwargs: Accepted and ignored.
+    """
+
+    # Fixed padding sizes so that every sample yields identically shaped
+    # arrays regardless of how many boxes / characters it contains.
+    _MAX_BOX_NUM = 100
+    _MAX_TEXT_LEN = 100
+
+    def __init__(self, character_dict_path, norm=10, directed=False, **kwargs):
+        super(KieLabelEncode, self).__init__()
+        self.dict = {'': 0}
+        # Read as UTF-8 explicitly: the platform default encoding may be
+        # unable to decode a dictionary that contains non-ASCII characters.
+        with open(character_dict_path, 'r', encoding='utf-8') as fr:
+            for idx, line in enumerate(fr, start=1):
+                # NOTE(review): strip() maps a whitespace-only line to '',
+                # which overwrites the reserved '' entry - confirm intended.
+                self.dict[line.strip()] = idx
+        self.norm = norm
+        self.directed = directed
+
+    def compute_relation(self, boxes):
+        """Compute the pairwise relation features between every two boxes.
+
+        Args:
+            boxes: Integer array of shape (n, >=6). Columns 0/1 are read as
+                (x1, y1) and columns 4/5 as (x2, y2).
+
+        Returns:
+            Tuple ``(relations, bboxes)``. ``relations`` has shape (n, n, 5);
+            entry ``[i, j]`` holds ``(x1_j - x1_i) / norm``,
+            ``(y1_j - y1_i) / norm``, ``w_i / h_i``, ``h_j / h_i`` and
+            ``w_j / h_i``. ``bboxes`` is a float32 (n, 4) array of
+            ``[x1, y1, x2, y2]``.
+        """
+        x1s, y1s = boxes[:, 0:1], boxes[:, 1:2]
+        x2s, y2s = boxes[:, 4:5], boxes[:, 5:6]
+        # Heights are clamped to >= 1 because they are used as divisors.
+        ws, hs = x2s - x1s + 1, np.maximum(y2s - y1s + 1, 1)
+        dxs = (x1s[:, 0][None] - x1s) / self.norm
+        dys = (y1s[:, 0][None] - y1s) / self.norm
+        xhhs, xwhs = hs[:, 0][None] / hs, ws[:, 0][None] / hs
+        # Adding zeros broadcasts the per-box ratio from (n, 1) to (n, n).
+        whs = ws / hs + np.zeros_like(xhhs)
+        relations = np.stack([dxs, dys, whs, xhhs, xwhs], -1)
+        bboxes = np.concatenate([x1s, y1s, x2s, y2s], -1).astype(np.float32)
+        return relations, bboxes
+
+    def pad_text_indices(self, text_inds):
+        """Pad text index lists to the same length.
+
+        Args:
+            text_inds: List of lists of character indices, one per box.
+
+        Returns:
+            Tuple ``(padded_text_inds, recoder_len)``: an int32 array of
+            shape (len(text_inds), 100) filled with -1 past each text's end,
+            and the length of the longest (possibly truncated) text, which
+            is 0 when ``text_inds`` is empty.
+        """
+        max_len = self._MAX_TEXT_LEN
+        padded_text_inds = -np.ones((len(text_inds), max_len), np.int32)
+        recoder_len = 0
+        for idx, text_ind in enumerate(text_inds):
+            # Truncate over-long texts so they always fit the fixed-width
+            # row instead of failing with a broadcasting error.
+            text_ind = text_ind[:max_len]
+            padded_text_inds[idx, :len(text_ind)] = np.array(text_ind)
+            recoder_len = max(recoder_len, len(text_ind))
+        return padded_text_inds, recoder_len
+
+    def list_to_numpy(self, ann_infos):
+        """Convert bboxes, relations, texts and labels to padded ndarrays.
+
+        Args:
+            ann_infos: Dict with keys ``image``, ``points`` (one flat
+                coordinate list per box) and ``text_inds``; ``labels`` and
+                ``edges`` are optional.
+
+        Returns:
+            Dict with ``image`` (passed through), ``points`` (100, 4),
+            ``relations`` (100, 100, 5), ``texts`` (100, 100),
+            ``labels`` (100, 100) and ``tag`` = ``[num_boxes, recoder_len]``.
+            When edges are given, column 0 of ``labels`` is the box label
+            and columns 1.. hold the edge matrix with -1 on its diagonal.
+
+        Raises:
+            ValueError: If the sample does not fit the fixed padding size.
+        """
+        max_num = self._MAX_BOX_NUM
+        boxes, text_inds = ann_infos['points'], ann_infos['text_inds']
+        boxes = np.array(boxes, np.int32)
+        if boxes.size == 0:
+            # Keep a 2-D layout so the column slicing in compute_relation
+            # still works for a sample without any box.
+            boxes = boxes.reshape(0, 8)
+        relations, bboxes = self.compute_relation(boxes)
+        h = bboxes.shape[0]
+        if h > max_num:
+            raise ValueError(
+                'KieLabelEncode supports at most {} boxes per image, got {}'.
+                format(max_num, h))
+
+        labels = ann_infos.get('labels', None)
+        if labels is not None:
+            labels = np.array(labels, np.int32)
+            edges = ann_infos.get('edges', None)
+            if edges is not None:
+                labels = labels[:, None]
+                edges = np.array(edges)
+                # Two boxes are linked when they carry the same edge id.
+                edges = (edges[:, None] == edges[None, :]).astype(np.int32)
+                if self.directed:
+                    edges = (edges & (labels == 1)).astype(np.int32)
+                # A box has no relation with itself.
+                np.fill_diagonal(edges, -1)
+                labels = np.concatenate([labels, edges], -1)
+            elif labels.ndim == 1:
+                labels = labels[:, None]
+        padded_text_inds, recoder_len = self.pad_text_indices(text_inds)
+
+        temp_bboxes = np.zeros([max_num, 4])
+        # All four coordinates of each of the h boxes are copied; slicing the
+        # columns with ``:h`` would break for samples with fewer than 4 boxes.
+        temp_bboxes[:h, :] = bboxes
+
+        temp_relations = np.zeros([max_num, max_num, 5])
+        temp_relations[:h, :h, :] = relations
+
+        temp_padded_text_inds = np.zeros([max_num, self._MAX_TEXT_LEN])
+        temp_padded_text_inds[:h, :] = padded_text_inds
+
+        temp_labels = np.zeros([max_num, max_num])
+        if labels is not None:
+            label_width = labels.shape[1]
+            if label_width > max_num:
+                raise ValueError(
+                    'labels need {} columns but only {} are available'.
+                    format(label_width, max_num))
+            temp_labels[:h, :label_width] = labels
+
+        # tag records the real (unpadded) sizes of this sample.
+        tag = np.array([h, recoder_len])
+        return dict(
+            image=ann_infos['image'],
+            points=temp_bboxes,
+            relations=temp_relations,
+            texts=temp_padded_text_inds,
+            labels=temp_labels,
+            tag=tag)
+
+    def convert_canonical(self, points_x, points_y):
+        """Rotate four vertices so the one nearest the top-left comes first.
+
+        The vertex closest to the (min_x, min_y) corner of the bounding
+        rectangle becomes the first point; the cyclic order is preserved.
+
+        Args:
+            points_x: Four x coordinates.
+            points_y: Four y coordinates.
+
+        Returns:
+            Tuple ``(sorted_points_x, sorted_points_y)`` of reordered lists.
+        """
+        assert len(points_x) == 4
+        assert len(points_y) == 4
+
+        points = [Point(points_x[i], points_y[i]) for i in range(4)]
+
+        polygon = Polygon([(p.x, p.y) for p in points])
+        min_x, min_y, _, _ = polygon.bounds
+        points_to_lefttop = [
+            LineString([points[i], Point(min_x, min_y)]) for i in range(4)
+        ]
+        distances = np.array([line.length for line in points_to_lefttop])
+        lefttop_idx = int(np.argsort(distances)[0])
+
+        # Cyclic rotation that starts at the chosen vertex.
+        point_orders = [(lefttop_idx + i) % 4 for i in range(4)]
+
+        sorted_points_x = [points_x[i] for i in point_orders]
+        sorted_points_y = [points_y[j] for j in point_orders]
+
+        return sorted_points_x, sorted_points_y
+
+    def sort_vertex(self, points_x, points_y):
+        """Order four vertices by angle around their centroid.
+
+        The points are sorted by ``arctan2`` of their offset from the mean
+        point and then passed to ``convert_canonical`` to pick the start.
+
+        Args:
+            points_x: Four x coordinates.
+            points_y: Four y coordinates.
+
+        Returns:
+            Tuple ``(sorted_points_x, sorted_points_y)`` of reordered lists.
+        """
+        assert len(points_x) == 4
+        assert len(points_y) == 4
+
+        x = np.array(points_x)
+        y = np.array(points_y)
+        center_x = np.sum(x) * 0.25
+        center_y = np.sum(y) * 0.25
+
+        x_arr = np.array(x - center_x)
+        y_arr = np.array(y - center_y)
+
+        angle = np.arctan2(y_arr, x_arr) * 180.0 / np.pi
+        sort_idx = np.argsort(angle)
+
+        sorted_points_x = [points_x[i] for i in sort_idx]
+        sorted_points_y = [points_y[i] for i in sort_idx]
+
+        return self.convert_canonical(sorted_points_x, sorted_points_y)
+
+    def __call__(self, data):
+        """Encode one sample.
+
+        Args:
+            data: Dict with ``image`` and ``label``. ``label`` is a JSON
+                string holding a list of annotations, each with ``points``
+                (the first four points are used), ``transcription``,
+                ``label`` and an optional ``edge`` (defaults to 0).
+
+        Returns:
+            The dict produced by ``list_to_numpy``.
+        """
+        import json
+        label = data['label']
+        annotations = json.loads(label)
+        boxes, texts, text_inds, labels, edges = [], [], [], [], []
+        for ann in annotations:
+            box = ann['points']
+            x_list = [box[i][0] for i in range(4)]
+            y_list = [box[i][1] for i in range(4)]
+            sorted_x_list, sorted_y_list = self.sort_vertex(x_list, y_list)
+            # Flatten to [x0, y0, x1, y1, x2, y2, x3, y3].
+            sorted_box = []
+            for x, y in zip(sorted_x_list, sorted_y_list):
+                sorted_box.append(x)
+                sorted_box.append(y)
+            boxes.append(sorted_box)
+            text = ann['transcription']
+            texts.append(text)
+            # Characters missing from the dictionary are silently dropped.
+            text_inds.append([self.dict[c] for c in text if c in self.dict])
+            labels.append(ann['label'])
+            edges.append(ann.get('edge', 0))
+        ann_infos = dict(
+            image=data['image'],
+            points=boxes,
+            texts=texts,
+            text_inds=text_inds,
+            edges=edges,
+            labels=labels)
+
+        return self.list_to_numpy(ann_infos)
+
+
 class AttnLabelEncode(BaseRecLabelEncode):
     """ Convert between text-label and text-index """