remove some of the less common dependencies #13461

Merged · 2 commits · Jul 24, 2024

Changes from all commits
5 changes: 5 additions & 0 deletions doc/doc_ch/algorithm_rec_latex_ocr.md
@@ -33,6 +33,11 @@
## 2. Environment Configuration
Please first refer to ["Environment Preparation"](./environment.md) to configure the PaddleOCR runtime environment, and to ["Project Clone"](./clone.md) to clone the project code.

In addition, the following extra dependencies need to be installed:
```shell
pip install "tokenizers==0.19.1" "imagesize"
```

<a name="3"></a>
## 3. 模型训练、评估、预测

4 changes: 4 additions & 0 deletions doc/doc_en/algorithm_rec_latex_ocr_en.md
@@ -31,6 +31,10 @@ Using LaTeX-OCR printed mathematical expression recognition datasets for trainin
## 2. Environment
Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code.

Furthermore, additional dependencies need to be installed:
```shell
pip install "tokenizers==0.19.1" "imagesize"
```

<a name="3"></a>
## 3. Model Training / Evaluation / Prediction
3 changes: 2 additions & 1 deletion ppocr/data/imaug/label_ops.py
@@ -26,7 +26,6 @@
import random
from random import sample
from collections import defaultdict
from tokenizers import Tokenizer as TokenizerFast

from ppocr.utils.logging import get_logger
from ppocr.data.imaug.vqa.augment import order_by_tbyx
@@ -1780,6 +1779,8 @@ def __init__(
rec_char_dict_path,
**kwargs,
):
from tokenizers import Tokenizer as TokenizerFast

self.tokenizer = TokenizerFast.from_file(rec_char_dict_path)
self.model_input_names = ["input_ids", "token_type_ids", "attention_mask"]
self.pad_token_id = 0
3 changes: 2 additions & 1 deletion ppocr/postprocess/rec_postprocess.py
@@ -15,7 +15,6 @@
import numpy as np
import paddle
from paddle.nn import functional as F
from tokenizers import Tokenizer as TokenizerFast
import re


@@ -1217,6 +1216,8 @@ class LaTeXOCRDecode(object):
"""Convert between latex-symbol and symbol-index"""

def __init__(self, rec_char_dict_path, **kwargs):
from tokenizers import Tokenizer as TokenizerFast

super(LaTeXOCRDecode, self).__init__()
self.tokenizer = TokenizerFast.from_file(rec_char_dict_path)

4 changes: 2 additions & 2 deletions ppocr/utils/formula_utils/math_txt2pkl.py
@@ -15,15 +15,15 @@
import pickle
from tqdm import tqdm
import os
import cv2
import imagesize
from paddle.utils import try_import
from collections import defaultdict
import glob
from os.path import join
import argparse


def txt2pickle(images, equations, save_dir):
imagesize = try_import("imagesize")
save_p = os.path.join(save_dir, "latexocr_{}.pkl".format(images.split("/")[-1]))
min_dimensions = (32, 32)
max_dimensions = (672, 192)
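For context, `paddle.utils.try_import` defers loading `imagesize` until `txt2pickle` actually runs, so users who never build the LaTeX-OCR dataset pickle do not need the package installed. A minimal sketch of the pattern, assuming the standard `imagesize` API; the `get_image_size` helper and the comments are illustrative, not part of the repository:

```python
from paddle.utils import try_import


def get_image_size(img_path):
    # Lazy import: the dependency is only resolved when this function is
    # called; a missing module surfaces as an ImportError at this point
    # instead of at module import time.
    imagesize = try_import("imagesize")
    # imagesize.get reads only the image header, avoiding a full decode.
    width, height = imagesize.get(img_path)
    return width, height
```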
2 changes: 0 additions & 2 deletions requirements.txt
@@ -13,5 +13,3 @@ Pillow
pyyaml
requests
albumentations==1.4.10
tokenizers==0.19.1
imagesize
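The same idea drives the `tokenizers` changes above: the import moves from module level into the constructors that need it, so importing `label_ops.py` or `rec_postprocess.py` for other recognition algorithms no longer requires `tokenizers` to be installed. A stand-alone sketch of that deferred-import pattern, with an illustrative class name rather than the repository's:

```python
class LatexTokenizerWrapper:
    """Thin wrapper that only needs `tokenizers` when constructed."""

    def __init__(self, rec_char_dict_path):
        # Deferred import: the dependency is resolved only when an
        # instance is created, not when this module is imported.
        from tokenizers import Tokenizer as TokenizerFast

        self.tokenizer = TokenizerFast.from_file(rec_char_dict_path)

    def encode(self, text):
        # Returns the token ids for a LaTeX string.
        return self.tokenizer.encode(text).ids
```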