Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

【OCR Issue No.9】移除明确不适合放在ppocr依赖中的依赖项 #11946

Merged
merged 11 commits into from
Apr 26, 2024
2 changes: 2 additions & 0 deletions paddleocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
__dir__ = os.path.dirname(__file__)

import paddle
from paddle.utils import try_import

sys.path.append(os.path.join(__dir__, ""))

Expand Down Expand Up @@ -910,6 +911,7 @@ def main():
img = cv2.imread(img_path)

if args.recovery and args.use_pdf2docx_api and flag_pdf:
try_import("pdf2docx")
from pdf2docx.converter import Converter

docx_file = os.path.join(args.output, "{}.docx".format(img_name))
Expand Down
4 changes: 3 additions & 1 deletion ppstructure/pdf2word/pdf2word.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@

fitz = try_import("fitz")
from PIL import Image
from pdf2docx.converter import Converter
from qtpy.QtWidgets import (
QApplication,
QWidget,
Expand Down Expand Up @@ -209,6 +208,9 @@ def run(self):
break
# when use_pdf2docx_api is set, parse PDF inputs with the pdf2docx API
if self.use_pdf2docx_api and os.path.basename(image_file)[-3:] == "pdf":
try_import("pdf2docx")
from pdf2docx.converter import Converter

self.totalPageCnt += 1
self.progressBarRange.emit(self.totalPageCnt)
print("===============using use_pdf2docx_api===============")
Expand Down
2 changes: 2 additions & 0 deletions ppstructure/predict_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import logging
from copy import deepcopy

from paddle.utils import try_import
from ppocr.utils.utility import get_image_file_list, check_and_read
from ppocr.utils.logging import get_logger
from ppocr.utils.visual import draw_ser_results, draw_re_results
Expand Down Expand Up @@ -300,6 +301,7 @@ def main(args):
img_name = os.path.basename(image_file).split(".")[0]

if args.recovery and args.use_pdf2docx_api and flag_pdf:
try_import("pdf2docx")
from pdf2docx.converter import Converter

os.makedirs(args.output, exist_ok=True)
Expand Down
1 change: 0 additions & 1 deletion ppstructure/recovery/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,3 @@ python-docx
beautifulsoup4
fonttools>=4.24.0
fire>=0.3.0
pdf2docx
5 changes: 4 additions & 1 deletion ppstructure/table/table_metric/table_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
from rapidfuzz.distance import Levenshtein
from apted import APTED, Config
from apted.helpers import Tree
from lxml import etree, html
from collections import deque
from .parallel import parallel_process
from tqdm import tqdm
from paddle.utils import try_import


class TableTree(Tree):
Expand Down Expand Up @@ -161,6 +161,9 @@ def evaluate(self, pred, true):
"""Computes TEDS score between the prediction and the ground truth of a
given sample
"""
try_import("lxml")
from lxml import etree, html

if (not pred) or (not true):
return 0.0
parser = html.HTMLParser(remove_comments=True, encoding="utf-8")
Expand Down
33 changes: 20 additions & 13 deletions ppstructure/table/tablepyxl/style.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,26 @@
# This is where we handle translating CSS styles into openpyxl styles
# and cascading those from parent to child in the DOM.

from openpyxl.cell import cell
from openpyxl.styles import (
Font,
Alignment,
PatternFill,
NamedStyle,
Border,
Side,
Color,
)
from openpyxl.styles.fills import FILL_SOLID
from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE
from openpyxl.styles.colors import BLACK
# openpyxl is treated as an optional dependency here: if it is missing, the
# module still imports and only emits a warning. The names below stay
# undefined in that case, so code that uses them fails at call time.
try:
    from openpyxl.cell import cell
    from openpyxl.styles import (
        Font,
        Alignment,
        PatternFill,
        NamedStyle,
        Border,
        Side,
        Color,
    )
    from openpyxl.styles.fills import FILL_SOLID
    from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE
    from openpyxl.styles.colors import BLACK
except ImportError:
    # Catch only ImportError: a bare ``except:`` would also swallow
    # KeyboardInterrupt/SystemExit and hide unrelated bugs behind a
    # misleading "can not import" warning.
    import warnings

    warnings.warn(
        "Can not import openpyxl, some functions in the ppstructure may not work. Please manually install openpyxl before using ppstructure."
    )

FORMAT_DATE_MMDDYYYY = "mm/dd/yyyy"

Expand Down
16 changes: 12 additions & 4 deletions ppstructure/table/tablepyxl/tablepyxl.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
# Do imports like python3 so our package works for 2 and 3
from __future__ import absolute_import

from lxml import html
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from premailer import Premailer

from tablepyxl.style import Table
from paddle.utils import try_import


def string_to_int(s):
Expand All @@ -15,6 +13,9 @@ def string_to_int(s):


def get_Tables(doc):
try_import("lxml")
from lxml import etree, html

tree = html.fromstring(doc)
comments = tree.xpath("//comment()")
for comment in comments:
Expand All @@ -27,7 +28,9 @@ def write_rows(worksheet, elem, row, column=1):
Writes every tr child element of elem to a row in the worksheet
returns the next row after all rows are written
"""
try_import("openpyxl")
from openpyxl.cell.cell import MergedCell
from openpyxl.utils import get_column_letter

initial_column = column
for table_row in elem.rows:
Expand Down Expand Up @@ -87,6 +90,11 @@ def document_to_workbook(doc, wb=None, base_url=None):
every table in the document.
The workbook is returned
"""
try_import("premailer")
try_import("openpyxl")
from premailer import Premailer
from openpyxl import Workbook

if not wb:
wb = Workbook()
wb.remove(wb.active)
Expand Down
6 changes: 1 addition & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,5 @@ rapidfuzz
opencv-python<=4.6.0.66
opencv-contrib-python<=4.6.0.66
cython
lxml
premailer
openpyxl
attrdict
Pillow>=10.0.0
pyyaml
pyyaml
Loading