PaddlePaddle · jzhang533 · Apr 26, 2024 · Apr 16, 2024 · Apr 16, 2024 · Apr 16, 2024
diff --git a/paddleocr.py b/paddleocr.py
@@ -19,6 +19,7 @@
 __dir__ = os.path.dirname(__file__)
 
 import paddle
+from paddle.utils import try_import
 
 sys.path.append(os.path.join(__dir__, ""))
 
@@ -910,6 +911,7 @@ def main():
                 img = cv2.imread(img_path)
 
             if args.recovery and args.use_pdf2docx_api and flag_pdf:
+                try_import("pdf2docx")
                 from pdf2docx.converter import Converter
 
                 docx_file = os.path.join(args.output, "{}.docx".format(img_name))

diff --git a/ppstructure/pdf2word/pdf2word.py b/ppstructure/pdf2word/pdf2word.py
@@ -25,7 +25,6 @@
 
 fitz = try_import("fitz")
 from PIL import Image
-from pdf2docx.converter import Converter
 from qtpy.QtWidgets import (
     QApplication,
     QWidget,
@@ -209,6 +208,9 @@ def run(self):
                     break
                 # using use_pdf2docx_api for PDF parsing
                 if self.use_pdf2docx_api and os.path.basename(image_file)[-3:] == "pdf":
+                    try_import("pdf2docx")
+                    from pdf2docx.converter import Converter
+
                     self.totalPageCnt += 1
                     self.progressBarRange.emit(self.totalPageCnt)
                     print("===============using use_pdf2docx_api===============")

diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py
@@ -28,6 +28,7 @@
 import logging
 from copy import deepcopy
 
+from paddle.utils import try_import
 from ppocr.utils.utility import get_image_file_list, check_and_read
 from ppocr.utils.logging import get_logger
 from ppocr.utils.visual import draw_ser_results, draw_re_results
@@ -300,6 +301,7 @@ def main(args):
         img_name = os.path.basename(image_file).split(".")[0]
 
         if args.recovery and args.use_pdf2docx_api and flag_pdf:
+            try_import("pdf2docx")
             from pdf2docx.converter import Converter
 
             os.makedirs(args.output, exist_ok=True)

diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt
@@ -2,4 +2,3 @@ python-docx
 beautifulsoup4
 fonttools>=4.24.0
 fire>=0.3.0
-pdf2docx
diff --git a/ppstructure/table/table_metric/table_metric.py b/ppstructure/table/table_metric/table_metric.py
@@ -12,10 +12,10 @@
 from rapidfuzz.distance import Levenshtein
 from apted import APTED, Config
 from apted.helpers import Tree
-from lxml import etree, html
 from collections import deque
 from .parallel import parallel_process
 from tqdm import tqdm
+from paddle.utils import try_import
 
 
 class TableTree(Tree):
@@ -161,6 +161,9 @@ def evaluate(self, pred, true):
         """Computes TEDS score between the prediction and the ground truth of a
         given sample
         """
+        try_import("lxml")
+        from lxml import etree, html
+
         if (not pred) or (not true):
             return 0.0
         parser = html.HTMLParser(remove_comments=True, encoding="utf-8")

diff --git a/ppstructure/table/tablepyxl/style.py b/ppstructure/table/tablepyxl/style.py
@@ -1,19 +1,26 @@
 # This is where we handle translating css styles into openpyxl styles
 # and cascading those from parent to child in the dom.
 
-from openpyxl.cell import cell
-from openpyxl.styles import (
-    Font,
-    Alignment,
-    PatternFill,
-    NamedStyle,
-    Border,
-    Side,
-    Color,
-)
-from openpyxl.styles.fills import FILL_SOLID
-from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE
-from openpyxl.styles.colors import BLACK
+try:
+    from openpyxl.cell import cell
+    from openpyxl.styles import (
+        Font,
+        Alignment,
+        PatternFill,
+        NamedStyle,
+        Border,
+        Side,
+        Color,
+    )
+    from openpyxl.styles.fills import FILL_SOLID
+    from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE
+    from openpyxl.styles.colors import BLACK
+except ImportError:  # openpyxl is optional; never swallow KeyboardInterrupt/SystemExit or unrelated errors
+    import warnings
+
+    warnings.warn(
+        "Can not import openpyxl, some functions in the ppstructure may not work. Please manually install openpyxl before using ppstructure."
+    )
 
 FORMAT_DATE_MMDDYYYY = "mm/dd/yyyy"
 

diff --git a/ppstructure/table/tablepyxl/tablepyxl.py b/ppstructure/table/tablepyxl/tablepyxl.py
@@ -1,11 +1,9 @@
 # Do imports like python3 so our package works for 2 and 3
 from __future__ import absolute_import
 
-from lxml import html
-from openpyxl import Workbook
-from openpyxl.utils import get_column_letter
-from premailer import Premailer
+
 from tablepyxl.style import Table
+from paddle.utils import try_import
 
 
 def string_to_int(s):
@@ -15,6 +13,9 @@ def string_to_int(s):
 
 
 def get_Tables(doc):
+    try_import("lxml")
+    from lxml import etree, html
+
     tree = html.fromstring(doc)
     comments = tree.xpath("//comment()")
     for comment in comments:
@@ -27,7 +28,9 @@ def write_rows(worksheet, elem, row, column=1):
     Writes every tr child element of elem to a row in the worksheet
     returns the next row after all rows are written
     """
+    try_import("openpyxl")
     from openpyxl.cell.cell import MergedCell
+    from openpyxl.utils import get_column_letter
 
     initial_column = column
     for table_row in elem.rows:
@@ -87,6 +90,11 @@ def document_to_workbook(doc, wb=None, base_url=None):
     every table in the document.
     The workbook is returned
     """
+    try_import("premailer")
+    try_import("openpyxl")
+    from premailer import Premailer
+    from openpyxl import Workbook
+
     if not wb:
         wb = Workbook()
         wb.remove(wb.active)

diff --git a/requirements.txt b/requirements.txt
@@ -9,9 +9,5 @@ rapidfuzz
 opencv-python<=4.6.0.66
 opencv-contrib-python<=4.6.0.66
 cython
-lxml
-premailer
-openpyxl
-attrdict
 Pillow>=10.0.0
-pyyaml
+pyyaml