cla_email

PaddlePaddle · Jul 22, 2024 · ac5d6c1
1 parent beb1dde
commit ac5d6c1
Show file tree

Hide file tree

Showing 8 changed files with 28 additions and 34 deletions.
diff --git a/configs/rec/rec_latex_ocr.yml b/configs/rec/rec_latex_ocr.yml
@@ -16,7 +16,7 @@ Global:
   infer_img: doc/datasets/pme_demo/0000013.png
   infer_mode: False
   use_space_char: False
-  fast_tokenizer_file:  ppocr/utils/dict/latex_ocr_tokenizer.json
+  rec_char_dict_path:  ppocr/utils/dict/latex_ocr_tokenizer.json
   save_res_path: ./output/rec/predicts_latexocr.txt
 
 Optimizer:
@@ -46,7 +46,6 @@ Architecture:
   Head:
     name: LaTeXOCRHead
     pad_value: 0
-    ignore_index: -100
     is_export: False
     decoder_args:
       attn_on_attn: True
@@ -60,7 +59,7 @@ Loss:
 
 PostProcess:
   name: LaTeXOCRDecode
-  fast_tokenizer_file: ppocr/utils/dict/latex_ocr_tokenizer.json
+  rec_char_dict_path: ppocr/utils/dict/latex_ocr_tokenizer.json
 
 Metric:
   name: LaTeXOCRMetric

diff --git a/doc/doc_ch/algorithm_rec_latex_ocr.md b/doc/doc_ch/algorithm_rec_latex_ocr.md
@@ -75,7 +75,7 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3'  tools/train.py -c configs
 
 - 默认每训练22个epoch（60000次iteration）进行1次评估，若您更改训练的batch_size，或更换数据集，请在训练时作出如下修改
 ```
-python3 tools/train.py -c configs/rec/rec_latex_ocr.yml -o Global.eval_batch_step=[0, {length_of_dataset//batch_size*22}]
+python3 tools/train.py -c configs/rec/rec_latex_ocr.yml -o Global.eval_batch_step=[0,{length_of_dataset//batch_size*22}]
 ```
 
 <a name="3-2"></a>
@@ -115,7 +115,7 @@ python3 tools/export_model.py -c configs/rec/rec_latex_ocr.yml -o Global.pretrai
 # 目前的静态图模型支持的最大输出长度为512
 ```
 **注意：**
-- 如果您是在自己的数据集上训练的模型，并且调整了字典文件，请注意修改配置文件中的`character_dict_path`是否是所需要的字典文件。
+- 如果您是在自己的数据集上训练的模型，并且调整了字典文件，请检查配置文件中的`rec_char_dict_path`是否为所需要的字典文件。
 - [转换后模型下载地址](https://paddleocr.bj.bcebos.com/contribution/rec_latex_ocr_infer.tar)
 
 转换成功后，在目录下有三个文件：

diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py
@@ -1777,10 +1777,10 @@ def encodech(self, text):
 class LatexOCRLabelEncode(object):
     def __init__(
         self,
-        fast_tokenizer_file,
+        rec_char_dict_path,
         **kwargs,
     ):
-        self.tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
+        self.tokenizer = TokenizerFast.from_file(rec_char_dict_path)
         self.model_input_names = ["input_ids", "token_type_ids", "attention_mask"]
         self.pad_token_id = 0
         self.bos_token_id = 1

diff --git a/ppocr/data/imaug/latex_ocr_aug.py b/ppocr/data/imaug/latex_ocr_aug.py
@@ -25,19 +25,19 @@
 import math
 import cv2
 import numpy as np
-import albumentations as alb
+import albumentations as A
 from PIL import Image
 
 
 class LatexTrainTransform:
     def __init__(self, bitmap_prob=0.04, **kwargs):
         # your init code
         self.bitmap_prob = bitmap_prob
-        self.train_transform = alb.Compose(
+        self.train_transform = A.Compose(
             [
-                alb.Compose(
+                A.Compose(
                     [
-                        alb.ShiftScaleRotate(
+                        A.ShiftScaleRotate(
                             shift_limit=0,
                             scale_limit=(-0.15, 0),
                             rotate_limit=1,
@@ -46,7 +46,7 @@ def __init__(self, bitmap_prob=0.04, **kwargs):
                             value=[255, 255, 255],
                             p=1,
                         ),
-                        alb.GridDistortion(
+                        A.GridDistortion(
                             distort_limit=0.1,
                             border_mode=0,
                             interpolation=3,
@@ -56,13 +56,11 @@ def __init__(self, bitmap_prob=0.04, **kwargs):
                     ],
                     p=0.15,
                 ),
-                alb.RGBShift(
-                    r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.3
-                ),
-                alb.GaussNoise(10, p=0.2),
-                alb.RandomBrightnessContrast(0.05, (-0.2, 0), True, p=0.2),
-                alb.ImageCompression(95, p=0.3),
-                alb.ToGray(always_apply=True),
+                A.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.3),
+                A.GaussNoise(10, p=0.2),
+                A.RandomBrightnessContrast(0.05, (-0.2, 0), True, p=0.2),
+                A.ImageCompression(95, p=0.3),
+                A.ToGray(always_apply=True),
             ]
         )
 
@@ -71,17 +69,16 @@ def __call__(self, data):
         if np.random.random() < self.bitmap_prob:
             img[img != 255] = 0
         img = self.train_transform(image=img)["image"]
-        # print(img.shape)
         data["image"] = img
         return data
 
 
 class LatexTestTransform:
     def __init__(self, **kwargs):
         # your init code
-        self.test_transform = alb.Compose(
+        self.test_transform = A.Compose(
             [
-                alb.ToGray(always_apply=True),
+                A.ToGray(always_apply=True),
             ]
         )
 
@@ -170,11 +167,9 @@ def __init__(self, **kwargs):
 
     def __call__(self, data):
         img = data["image"]
-        # H, W, C
         im_h, im_w = img.shape[:2]
         divide_h = math.ceil(im_h / 16) * 16
         divide_w = math.ceil(im_w / 16) * 16
-        # print(img.shape, "pad_shape")
         img = img[:, :, 0]
         img = np.pad(
             img, ((0, divide_h - im_h), (0, divide_w - im_w)), constant_values=(1, 1)

diff --git a/ppocr/data/latexocr_dataset.py b/ppocr/data/latexocr_dataset.py
@@ -47,8 +47,8 @@ def __init__(self, config, mode, logger, seed=None):
         self.batchsize = dataset_config.pop("batch_size_per_pair")
         self.keep_smaller_batches = dataset_config.pop("keep_smaller_batches")
         self.max_seq_len = global_config.pop("max_seq_len")
-        self.fast_tokenizer_file = global_config.pop("fast_tokenizer_file")
-        self.tokenizer = LatexOCRLabelEncode(self.fast_tokenizer_file)
+        self.rec_char_dict_path = global_config.pop("rec_char_dict_path")
+        self.tokenizer = LatexOCRLabelEncode(self.rec_char_dict_path)
 
         file = open(pkl_path, "rb")
         data = pickle.load(file)

diff --git a/ppocr/modeling/heads/rec_latexocr_head.py b/ppocr/modeling/heads/rec_latexocr_head.py
@@ -848,7 +848,6 @@ def __init__(
         net=None,
         in_channels=256,
         out_channels=256,
-        ignore_index=-100,
         pad_value=0,
         decoder_args=None,
         is_export=False,
@@ -868,7 +867,6 @@ def __init__(
         self.eos_token = 2
         self.max_length = 512
         self.pad_value = pad_value
-        self.ignore_index = ignore_index
 
         self.net = transformer_decoder
         self.max_seq_len = self.net.max_seq_len

diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py
@@ -1216,9 +1216,9 @@ def add_special_char(self, dict_character):
 class LaTeXOCRDecode(object):
     """Convert between latex-symbol and symbol-index"""
 
-    def __init__(self, fast_tokenizer_file=None, **kwargs):
+    def __init__(self, rec_char_dict_path, **kwargs):
         super(LaTeXOCRDecode, self).__init__()
-        self.tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
+        self.tokenizer = TokenizerFast.from_file(rec_char_dict_path)
 
     def post_process(self, s):
         text_reg = r"(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})"

diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py
@@ -136,7 +136,7 @@ def __init__(self, args, logger=None):
         elif self.rec_algorithm == "LaTeXOCR":
             postprocess_params = {
                 "name": "LaTeXOCRDecode",
-                "fast_tokenizer_file": args.rec_char_dict_path,
+                "rec_char_dict_path": args.rec_char_dict_path,
             }
         elif self.rec_algorithm == "ParseQ":
             postprocess_params = {
@@ -515,8 +515,7 @@ def norm_img_latexocr(self, img):
         max_dimensions = [672, 192]
         mean = np.array(mean).reshape(shape).astype("float32")
         std = np.array(std).reshape(shape).astype("float32")
-        img = (img.astype("float32") * scale - mean) / std
-        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
         im_h, im_w = img.shape[:2]
         if (
             min_dimensions[0] <= im_w <= max_dimensions[0]
@@ -527,7 +526,10 @@ def norm_img_latexocr(self, img):
             img = Image.fromarray(np.uint8(img))
             img = self.minmax_size_(self.pad_(img), max_dimensions, min_dimensions)
             img = np.array(img)
-            img = np.dstack((img, img, img))
+            im_h, im_w = img.shape[:2]
+            img = np.dstack([img, img, img])
+        img = (img.astype("float32") * scale - mean) / std
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
         divide_h = math.ceil(im_h / 16) * 16
         divide_w = math.ceil(im_w / 16) * 16
         img = np.pad(