From ca3dfee1d943459e8361d5e9a56d676a0b330854 Mon Sep 17 00:00:00 2001 From: shaohuzhang1 Date: Thu, 28 Mar 2024 11:42:22 +0800 Subject: [PATCH 1/2] =?UTF-8?q?feat:=20=E5=88=86=E6=AE=B5API=E6=94=AF?= =?UTF-8?q?=E6=8C=81word,pdf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/common/handle/base_split_handle.py | 20 ++++++++ apps/common/handle/impl/doc_split_handle.py | 45 +++++++++++++++++ apps/common/handle/impl/pdf_split_handle.py | 50 +++++++++++++++++++ apps/common/handle/impl/text_split_handle.py | 47 +++++++++++++++++ .../serializers/document_serializers.py | 34 ++++++++----- pyproject.toml | 2 + 6 files changed, 185 insertions(+), 13 deletions(-) create mode 100644 apps/common/handle/base_split_handle.py create mode 100644 apps/common/handle/impl/doc_split_handle.py create mode 100644 apps/common/handle/impl/pdf_split_handle.py create mode 100644 apps/common/handle/impl/text_split_handle.py diff --git a/apps/common/handle/base_split_handle.py b/apps/common/handle/base_split_handle.py new file mode 100644 index 00000000000..2c3076c3d24 --- /dev/null +++ b/apps/common/handle/base_split_handle.py @@ -0,0 +1,20 @@ +# coding=utf-8 +""" + @project: maxkb + @Author:虎 + @file: base_split_handle.py + @date:2024/3/27 18:13 + @desc: +""" +from abc import ABC, abstractmethod +from typing import List + + +class BaseSplitHandle(ABC): + @abstractmethod + def support(self, file, get_buffer): + pass + + @abstractmethod + def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer): + pass diff --git a/apps/common/handle/impl/doc_split_handle.py b/apps/common/handle/impl/doc_split_handle.py new file mode 100644 index 00000000000..50addb32cd4 --- /dev/null +++ b/apps/common/handle/impl/doc_split_handle.py @@ -0,0 +1,45 @@ +# coding=utf-8 +""" + @project: maxkb + @Author:虎 + @file: text_split_handle.py + @date:2024/3/27 18:19 + @desc: +""" +import io +import re +from typing import List + +from docx import Document + +from common.handle.base_split_handle import BaseSplitHandle +from common.util.split_model import SplitModel + +default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(? 0: + split_model = SplitModel(pattern_list, with_filter, limit) + else: + split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit) + except BaseException as e: + return {'name': file.name, + 'content': []} + return {'name': file.name, + 'content': split_model.parse(content) + } + + def support(self, file, get_buffer): + file_name: str = file.name.lower() + if file_name.endswith(".docx") or file_name.endswith(".doc"): + return True + return False diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py new file mode 100644 index 00000000000..c839a10aa8b --- /dev/null +++ b/apps/common/handle/impl/pdf_split_handle.py @@ -0,0 +1,50 @@ +# coding=utf-8 +""" + @project: maxkb + @Author:虎 + @file: text_split_handle.py + @date:2024/3/27 18:19 + @desc: +""" +import re +from typing import List + +import fitz + +from common.handle.base_split_handle import BaseSplitHandle +from common.util.split_model import SplitModel + +default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(? 0: + split_model = SplitModel(pattern_list, with_filter, limit) + else: + split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit) + except BaseException as e: + return {'name': file.name, + 'content': []} + return {'name': file.name, + 'content': split_model.parse(content) + } + + def support(self, file, get_buffer): + file_name: str = file.name.lower() + if file_name.endswith(".pdf"): + return True + return False diff --git a/apps/common/handle/impl/text_split_handle.py b/apps/common/handle/impl/text_split_handle.py new file mode 100644 index 00000000000..67f56c37d30 --- /dev/null +++ b/apps/common/handle/impl/text_split_handle.py @@ -0,0 +1,47 @@ +# coding=utf-8 +""" + @project: maxkb + @Author:虎 + @file: text_split_handle.py + @date:2024/3/27 18:19 + @desc: +""" +import re +from typing import List + +import chardet + +from common.handle.base_split_handle import BaseSplitHandle +from common.util.split_model import SplitModel + +default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(? 0.5: + return True + return False + + def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer): + buffer = get_buffer(file) + if pattern_list is not None and len(pattern_list) > 0: + split_model = SplitModel(pattern_list, with_filter, limit) + else: + split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit) + try: + content = buffer.decode(chardet.detect(buffer)['encoding']) + except BaseException as e: + return {'name': file.name, + 'content': []} + return {'name': file.name, + 'content': split_model.parse(content) + } diff --git a/apps/dataset/serializers/document_serializers.py b/apps/dataset/serializers/document_serializers.py index 529479927d4..8e062d8b4d7 100644 --- a/apps/dataset/serializers/document_serializers.py +++ b/apps/dataset/serializers/document_serializers.py @@ -22,6 +22,9 @@ from common.event.common import work_thread_pool from common.event.listener_manage import ListenerManagement, SyncWebDocumentArgs from common.exception.app_exception import AppApiException +from common.handle.impl.doc_split_handle import DocSplitHandle +from common.handle.impl.pdf_split_handle import PdfSplitHandle +from common.handle.impl.text_split_handle import TextSplitHandle from common.mixins.api_mixin import ApiMixin from common.util.common import post from common.util.field_message import ErrMessage @@ -593,17 +596,22 @@ def batch_delete(self, instance: Dict, with_valid=True): return True +class FileBufferHandle: + buffer = None + + def get_buffer(self, file): + if self.buffer is None: + self.buffer = file.read() + return self.buffer + + +default_split_handle = TextSplitHandle() +split_handles = [DocSplitHandle(), PdfSplitHandle(), default_split_handle] + + def file_to_paragraph(file, pattern_list: List, with_filter: bool, limit: int): - data = file.read() - if pattern_list is not None and len(pattern_list) > 0: - split_model = SplitModel(pattern_list, with_filter, limit) - else: - split_model = get_split_model(file.name, with_filter=with_filter, limit=limit) - try: - content = data.decode(chardet.detect(data)['encoding']) - except BaseException as e: - return {'name': file.name, - 'content': []} - return {'name': file.name, - 'content': split_model.parse(content) - } + get_buffer = FileBufferHandle().get_buffer + for split_handle in split_handles: + if split_handle.support(file, get_buffer): + return split_handle.handle(file, pattern_list, with_filter, limit, get_buffer) + return default_split_handle.handle(file, pattern_list, with_filter, limit, get_buffer) diff --git a/pyproject.toml b/pyproject.toml index 2675941b23d..b2152f4c9c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,8 @@ langchain-openai = "^0.0.8" django-ipware = "^6.0.4" django-apscheduler = "^0.6.2" chardet2 = "^2.0.3" +pymupdf = "^1.24.0" +python-docx = "^1.1.0" [build-system] requires = ["poetry-core"] From 2f305690fb77cacd9289d8c85309625cde45748a Mon Sep 17 00:00:00 2001 From: wangdan-fit2cloud Date: Fri, 29 Mar 2024 17:59:51 +0800 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20=E9=80=9A=E7=94=A8=E5=9E=8B=E7=9F=A5?= =?UTF-8?q?=E8=AF=86=E5=BA=93=E6=94=AF=E6=8C=81=E4=B8=8A=E4=BC=A0=20PDF/DO?= =?UTF-8?q?C=20=E6=A0=BC=E5=BC=8F=E7=9A=84=E6=96=87=E6=A1=A3#19?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ui/src/views/dataset/component/UploadComponent.vue | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ui/src/views/dataset/component/UploadComponent.vue b/ui/src/views/dataset/component/UploadComponent.vue index d4d3fc4ac8f..b750889c535 100644 --- a/ui/src/views/dataset/component/UploadComponent.vue +++ b/ui/src/views/dataset/component/UploadComponent.vue @@ -16,7 +16,7 @@ action="#" :auto-upload="false" :show-file-list="false" - accept=".txt, .md, .csv, .log" + accept=".txt, .md, .csv, .log, .doc, .docx, .pdf" :limit="50" :on-exceed="onExceed" >