From ca3dfee1d943459e8361d5e9a56d676a0b330854 Mon Sep 17 00:00:00 2001
From: shaohuzhang1 <shaohu.zhang@fit2cloud.com>
Date: Thu, 28 Mar 2024 11:42:22 +0800
Subject: [PATCH 1/2] =?UTF-8?q?feat:=20=E5=88=86=E6=AE=B5API=E6=94=AF?=
 =?UTF-8?q?=E6=8C=81word,pdf?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 apps/common/handle/base_split_handle.py       | 20 ++++++++
 apps/common/handle/impl/doc_split_handle.py   | 45 +++++++++++++++++
 apps/common/handle/impl/pdf_split_handle.py   | 50 +++++++++++++++++++
 apps/common/handle/impl/text_split_handle.py  | 47 +++++++++++++++++
 .../serializers/document_serializers.py       | 34 ++++++++-----
 pyproject.toml                                |  2 +
 6 files changed, 185 insertions(+), 13 deletions(-)
 create mode 100644 apps/common/handle/base_split_handle.py
 create mode 100644 apps/common/handle/impl/doc_split_handle.py
 create mode 100644 apps/common/handle/impl/pdf_split_handle.py
 create mode 100644 apps/common/handle/impl/text_split_handle.py

diff --git a/apps/common/handle/base_split_handle.py b/apps/common/handle/base_split_handle.py
new file mode 100644
index 00000000000..2c3076c3d24
--- /dev/null
+++ b/apps/common/handle/base_split_handle.py
@@ -0,0 +1,20 @@
+# coding=utf-8
+"""
+    @project: maxkb
+    @Author：虎
+    @file： base_split_handle.py
+    @date：2024/3/27 18:13
+    @desc:
+"""
+from abc import ABC, abstractmethod
+from typing import List
+
+
+class BaseSplitHandle(ABC):
+    @abstractmethod
+    def support(self, file, get_buffer):
+        """Tell whether this handler accepts ``file``; ``get_buffer(file)`` yields its bytes."""
+
+    @abstractmethod
+    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
+        """Split ``file`` into paragraphs; implementations return ``{'name': ..., 'content': ...}``."""
diff --git a/apps/common/handle/impl/doc_split_handle.py b/apps/common/handle/impl/doc_split_handle.py
new file mode 100644
index 00000000000..50addb32cd4
--- /dev/null
+++ b/apps/common/handle/impl/doc_split_handle.py
@@ -0,0 +1,45 @@
+# coding=utf-8
+"""
+    @project: maxkb
+    @Author：虎
+    @file： doc_split_handle.py
+    @date：2024/3/27 18:19
+    @desc:
+"""
+import io
+import re
+from typing import List
+
+from docx import Document
+
+from common.handle.base_split_handle import BaseSplitHandle
+from common.util.split_model import SplitModel
+
+default_pattern_list = [re.compile(pattern) for pattern in (  # markdown h1..h6, then blank-line runs
+    '(?<=^)# .*|(?<=\\n)# .*', '(?<!#)## (?!#).*', "(?<!#)### (?!#).*",
+    "(?<!#)#### (?!#).*", "(?<!#)##### (?!#).*", "(?<!#)###### (?!#).*",
+    "(?<!\n)\n\n+")]
+
+
+class DocSplitHandle(BaseSplitHandle):
+    """Splits Word documents into paragraphs; text is extracted with python-docx."""
+
+    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
+        """Return ``{'name': ..., 'content': ...}``; content is ``[]`` if the file cannot be read."""
+        try:
+            doc = Document(io.BytesIO(get_buffer(file)))
+            content = "\n".join(para.text for para in doc.paragraphs)
+            # Caller-supplied patterns win; otherwise use the module defaults.
+            if pattern_list:
+                split_model = SplitModel(pattern_list, with_filter, limit)
+            else:
+                split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
+        except Exception:
+            # Best effort for unreadable uploads. Exception (not BaseException)
+            # so KeyboardInterrupt/SystemExit still propagate.
+            return {'name': file.name, 'content': []}
+        return {'name': file.name, 'content': split_model.parse(content)}
+
+    def support(self, file, get_buffer):
+        # NOTE(review): python-docx parses .docx only; a legacy .doc presumably fails in handle() - confirm.
+        return file.name.lower().endswith((".docx", ".doc"))
diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py
new file mode 100644
index 00000000000..c839a10aa8b
--- /dev/null
+++ b/apps/common/handle/impl/pdf_split_handle.py
@@ -0,0 +1,50 @@
+# coding=utf-8
+"""
+    @project: maxkb
+    @Author：虎
+    @file： pdf_split_handle.py
+    @date：2024/3/27 18:19
+    @desc:
+"""
+import re
+from typing import List
+
+import fitz
+
+from common.handle.base_split_handle import BaseSplitHandle
+from common.util.split_model import SplitModel
+
+default_pattern_list = [re.compile(pattern) for pattern in (  # markdown h1..h6, then blank-line runs
+    '(?<=^)# .*|(?<=\\n)# .*', '(?<!#)## (?!#).*', "(?<!#)### (?!#).*",
+    "(?<!#)#### (?!#).*", "(?<!#)##### (?!#).*", "(?<!#)###### (?!#).*",
+    "(?<!\n)\n\n+")]
+
+
+def number_to_text(pdf_document, page_number):
+    """Return the text of page ``page_number`` (0-based) of ``pdf_document``."""
+    # load_page() followed by get_text() with its default arguments.
+    return pdf_document.load_page(page_number).get_text()
+
+
+class PdfSplitHandle(BaseSplitHandle):
+    """Splits PDF files into paragraphs; text is extracted page by page with PyMuPDF."""
+
+    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
+        """Return ``{'name': ..., 'content': ...}``; content is ``[]`` if the file cannot be read."""
+        try:
+            # Context manager closes the document even when extraction fails.
+            with fitz.open(file.name, get_buffer(file)) as pdf_document:
+                content = "\n".join(number_to_text(pdf_document, page_number)
+                                    for page_number in range(len(pdf_document)))
+            if pattern_list:
+                split_model = SplitModel(pattern_list, with_filter, limit)
+            else:
+                split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
+        except Exception:
+            # Exception (not BaseException) so KeyboardInterrupt/SystemExit still propagate.
+            return {'name': file.name, 'content': []}
+        return {'name': file.name, 'content': split_model.parse(content)}
+
+    def support(self, file, get_buffer):
+        """Accept files whose name ends in ``.pdf`` (case-insensitive)."""
+        return file.name.lower().endswith(".pdf")
diff --git a/apps/common/handle/impl/text_split_handle.py b/apps/common/handle/impl/text_split_handle.py
new file mode 100644
index 00000000000..67f56c37d30
--- /dev/null
+++ b/apps/common/handle/impl/text_split_handle.py
@@ -0,0 +1,47 @@
+# coding=utf-8
+"""
+    @project: maxkb
+    @Author：虎
+    @file： text_split_handle.py
+    @date：2024/3/27 18:19
+    @desc:
+"""
+import re
+from typing import List
+
+import chardet
+
+from common.handle.base_split_handle import BaseSplitHandle
+from common.util.split_model import SplitModel
+
+default_pattern_list = [re.compile(pattern) for pattern in (  # markdown h1..h6, then blank-line runs
+    '(?<=^)# .*|(?<=\\n)# .*', '(?<!#)## (?!#).*', "(?<!#)### (?!#).*",
+    "(?<!#)#### (?!#).*", "(?<!#)##### (?!#).*", "(?<!#)###### (?!#).*",
+    "(?<!\n)\n\n+")]
+
+
+class TextSplitHandle(BaseSplitHandle):
+    """Splits text files into paragraphs after detecting their encoding with chardet."""
+
+    def support(self, file, get_buffer):
+        """Accept ``.md``/``.txt`` names, else non-ascii content chardet detects with confidence > 0.5."""
+        if file.name.lower().endswith((".md", ".txt")):
+            # Extension is enough; skip reading the upload just to sniff it.
+            return True
+        result = chardet.detect(get_buffer(file))
+        return result['encoding'] != 'ascii' and result['confidence'] > 0.5
+
+    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
+        """Return ``{'name': ..., 'content': ...}``; content is ``[]`` if decoding fails."""
+        buffer = get_buffer(file)
+        if pattern_list:
+            split_model = SplitModel(pattern_list, with_filter, limit)
+        else:
+            split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
+        try:
+            # detect() may report encoding None, which makes decode() raise TypeError.
+            content = buffer.decode(chardet.detect(buffer)['encoding'])
+        except Exception:
+            # Exception (not BaseException) so KeyboardInterrupt/SystemExit still propagate.
+            return {'name': file.name, 'content': []}
+        return {'name': file.name, 'content': split_model.parse(content)}
diff --git a/apps/dataset/serializers/document_serializers.py b/apps/dataset/serializers/document_serializers.py
index 529479927d4..8e062d8b4d7 100644
--- a/apps/dataset/serializers/document_serializers.py
+++ b/apps/dataset/serializers/document_serializers.py
@@ -22,6 +22,9 @@
 from common.event.common import work_thread_pool
 from common.event.listener_manage import ListenerManagement, SyncWebDocumentArgs
 from common.exception.app_exception import AppApiException
+from common.handle.impl.doc_split_handle import DocSplitHandle
+from common.handle.impl.pdf_split_handle import PdfSplitHandle
+from common.handle.impl.text_split_handle import TextSplitHandle
 from common.mixins.api_mixin import ApiMixin
 from common.util.common import post
 from common.util.field_message import ErrMessage
@@ -593,17 +596,22 @@ def batch_delete(self, instance: Dict, with_valid=True):
             return True
 
 
+class FileBufferHandle:
+    buffer = None  # per-instance cache of the upload's bytes, filled on first read
+
+    def get_buffer(self, file):
+        """Read ``file`` on the first call and return the same bytes on every later call."""
+        self.buffer = file.read() if self.buffer is None else self.buffer
+        return self.buffer
+
+
+default_split_handle = TextSplitHandle()  # fallback used when no handler claims a file
+split_handles = [DocSplitHandle(), PdfSplitHandle(), default_split_handle]  # probed in this order
+
+
 def file_to_paragraph(file, pattern_list: List, with_filter: bool, limit: int):
-    data = file.read()
-    if pattern_list is not None and len(pattern_list) > 0:
-        split_model = SplitModel(pattern_list, with_filter, limit)
-    else:
-        split_model = get_split_model(file.name, with_filter=with_filter, limit=limit)
-    try:
-        content = data.decode(chardet.detect(data)['encoding'])
-    except BaseException as e:
-        return {'name': file.name,
-                'content': []}
-    return {'name': file.name,
-            'content': split_model.parse(content)
-            }
+    get_buffer = FileBufferHandle().get_buffer
+    # First handler that claims the file wins; plain text is the fallback.
+    chosen = next((handler for handler in split_handles if handler.support(file, get_buffer)),
+                  default_split_handle)
+    return chosen.handle(file, pattern_list, with_filter, limit, get_buffer)
diff --git a/pyproject.toml b/pyproject.toml
index 2675941b23d..b2152f4c9c9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,6 +31,8 @@ langchain-openai = "^0.0.8"
 django-ipware = "^6.0.4"
 django-apscheduler = "^0.6.2"
 chardet2 = "^2.0.3"
+pymupdf = "^1.24.0"
+python-docx = "^1.1.0"
 
 [build-system]
 requires = ["poetry-core"]

From 2f305690fb77cacd9289d8c85309625cde45748a Mon Sep 17 00:00:00 2001
From: wangdan-fit2cloud <dan.wang@fit2cloud.com>
Date: Fri, 29 Mar 2024 17:59:51 +0800
Subject: [PATCH 2/2] =?UTF-8?q?fix:=20=E9=80=9A=E7=94=A8=E5=9E=8B=E7=9F=A5?=
 =?UTF-8?q?=E8=AF=86=E5=BA=93=E6=94=AF=E6=8C=81=E4=B8=8A=E4=BC=A0=20PDF/DO?=
 =?UTF-8?q?C=20=E6=A0=BC=E5=BC=8F=E7=9A=84=E6=96=87=E6=A1=A3#19?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ui/src/views/dataset/component/UploadComponent.vue | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ui/src/views/dataset/component/UploadComponent.vue b/ui/src/views/dataset/component/UploadComponent.vue
index d4d3fc4ac8f..b750889c535 100644
--- a/ui/src/views/dataset/component/UploadComponent.vue
+++ b/ui/src/views/dataset/component/UploadComponent.vue
@@ -16,7 +16,7 @@
         action="#"
         :auto-upload="false"
         :show-file-list="false"
-        accept=".txt, .md, .csv, .log"
+        accept=".txt, .md, .csv, .log, .doc, .docx, .pdf"
         :limit="50"
         :on-exceed="onExceed"
       >