diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py index e4bac8d08ca..3e0d951e00e 100644 --- a/apps/common/handle/impl/pdf_split_handle.py +++ b/apps/common/handle/impl/pdf_split_handle.py @@ -9,14 +9,17 @@ import re from typing import List -import pypdf +from pypdf import PdfReader, PdfWriter import os import tempfile +import logging from langchain_community.document_loaders import PyPDFLoader from common.handle.base_split_handle import BaseSplitHandle from common.util.split_model import SplitModel +import time + default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<=\\n)(? 0: split_model = SplitModel(pattern_list, with_filter, limit) else: split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit) + + except BaseException as e: return {'name': file.name, 'content': []}