Skip to content

Pr@main@pdf #23

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions apps/common/handle/base_split_handle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: base_split_handle.py
@date:2024/3/27 18:13
@desc:
"""
from abc import ABC, abstractmethod
from typing import List


class BaseSplitHandle(ABC):
    """Abstract interface for handlers that split an uploaded file.

    A concrete handler answers two questions: whether it accepts a given
    file (``support``) and what the split result for that file is
    (``handle``).
    """

    @abstractmethod
    def support(self, file, get_buffer):
        """Tell whether this handler accepts ``file``.

        :param file: the uploaded file object.
        :param get_buffer: callable invoked as ``get_buffer(file)`` to obtain
            the file content.
        """
        return None

    @abstractmethod
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Produce the split result for ``file``.

        :param file: the uploaded file object.
        :param pattern_list: split patterns supplied by the caller.
        :param with_filter: option forwarded to the splitting step.
        :param limit: option forwarded to the splitting step.
        :param get_buffer: callable invoked as ``get_buffer(file)`` to obtain
            the file content.
        """
        return None
45 changes: 45 additions & 0 deletions apps/common/handle/impl/doc_split_handle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: doc_split_handle.py
@date:2024/3/27 18:19
@desc:
"""
import io
import re
from typing import List

from docx import Document

from common.handle.base_split_handle import BaseSplitHandle
from common.util.split_model import SplitModel

# Patterns handed to SplitModel when the caller supplies none:
#   1.   a "# " heading at the very start of the text or right after a newline
#   2-6. "## " through "###### " headings that are not preceded by another "#"
#   7.   a run of two or more newlines that is not itself preceded by a newline
default_pattern_list = [
    re.compile(expression)
    for expression in (
        '(?<=^)# .*|(?<=\\n)# .*',
        '(?<!#)## (?!#).*',
        "(?<!#)### (?!#).*",
        "(?<!#)#### (?!#).*",
        "(?<!#)##### (?!#).*",
        "(?<!#)###### (?!#).*",
        "(?<!\n)\n\n+",
    )
]


class DocSplitHandle(BaseSplitHandle):
    """Split handler for Word documents, read through ``docx.Document``."""

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Join the paragraph text of ``file`` and split it.

        :param file: uploaded file; ``file.name`` is echoed in the result.
        :param pattern_list: patterns for ``SplitModel``; when ``None`` or
            empty, ``default_pattern_list`` is used instead.
        :param with_filter: forwarded to ``SplitModel``.
        :param limit: forwarded to ``SplitModel``.
        :param get_buffer: callable invoked as ``get_buffer(file)``; its
            result is wrapped in ``io.BytesIO`` and opened as a document.
        :return: ``{'name': file.name, 'content': ...}`` where ``content`` is
            the output of ``SplitModel.parse``, or ``[]`` when the document
            could not be read or the split model could not be built.
        """
        try:
            buffer = get_buffer(file)
            doc = Document(io.BytesIO(buffer))
            content = "\n".join(para.text for para in doc.paragraphs)
            if pattern_list:
                split_model = SplitModel(pattern_list, with_filter, limit)
            else:
                split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        except Exception:
            # Best effort: an unreadable document yields empty content rather
            # than failing the whole upload. Exception (not BaseException) so
            # KeyboardInterrupt / SystemExit still propagate.
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)
                }

    def support(self, file, get_buffer):
        """Return ``True`` when the file name ends with ``.docx`` or ``.doc``.

        The check is case-insensitive and never touches ``get_buffer``.
        """
        # NOTE(review): python-docx targets the .docx format; legacy binary
        # .doc files presumably fail in handle() and come back with empty
        # content - confirm whether .doc should be advertised here.
        file_name: str = file.name.lower()
        return file_name.endswith((".docx", ".doc"))
50 changes: 50 additions & 0 deletions apps/common/handle/impl/pdf_split_handle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: pdf_split_handle.py
@date:2024/3/27 18:19
@desc:
"""
import re
from typing import List

import fitz

from common.handle.base_split_handle import BaseSplitHandle
from common.util.split_model import SplitModel

# Patterns handed to SplitModel when the caller supplies none:
#   1.   a "# " heading at the very start of the text or right after a newline
#   2-6. "## " through "###### " headings that are not preceded by another "#"
#   7.   a run of two or more newlines that is not itself preceded by a newline
default_pattern_list = [
    re.compile(expression)
    for expression in (
        '(?<=^)# .*|(?<=\\n)# .*',
        '(?<!#)## (?!#).*',
        "(?<!#)### (?!#).*",
        "(?<!#)#### (?!#).*",
        "(?<!#)##### (?!#).*",
        "(?<!#)###### (?!#).*",
        "(?<!\n)\n\n+",
    )
]


def number_to_text(pdf_document, page_number):
    """Return the text of page ``page_number`` of ``pdf_document``.

    The page is fetched with ``load_page`` and its text read with
    ``get_text``.
    """
    return pdf_document.load_page(page_number).get_text()


class PdfSplitHandle(BaseSplitHandle):
    """Split handler for PDF files, read through ``fitz`` (PyMuPDF)."""

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Join the text of every page of ``file`` and split it.

        :param file: uploaded file; ``file.name`` is passed to ``fitz.open``
            and echoed in the result.
        :param pattern_list: patterns for ``SplitModel``; when ``None`` or
            empty, ``default_pattern_list`` is used instead.
        :param with_filter: forwarded to ``SplitModel``.
        :param limit: forwarded to ``SplitModel``.
        :param get_buffer: callable invoked as ``get_buffer(file)``; its
            result is passed to ``fitz.open`` as the document data.
        :return: ``{'name': file.name, 'content': ...}`` where ``content`` is
            the output of ``SplitModel.parse``, or ``[]`` when the document
            could not be read or the split model could not be built.
        """
        try:
            buffer = get_buffer(file)
            # Context manager closes the document once the text is extracted,
            # including when a page fails to load.
            with fitz.open(file.name, buffer) as pdf_document:
                content = "\n".join(
                    number_to_text(pdf_document, page_number)
                    for page_number in range(len(pdf_document))
                )
            if pattern_list:
                split_model = SplitModel(pattern_list, with_filter, limit)
            else:
                split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        except Exception:
            # Best effort: an unreadable PDF yields empty content rather than
            # failing the whole upload. Exception (not BaseException) so
            # KeyboardInterrupt / SystemExit still propagate.
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)
                }

    def support(self, file, get_buffer):
        """Return ``True`` when the file name ends with ``.pdf``.

        The check is case-insensitive and never touches ``get_buffer``.
        """
        file_name: str = file.name.lower()
        return file_name.endswith(".pdf")
47 changes: 47 additions & 0 deletions apps/common/handle/impl/text_split_handle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: text_split_handle.py
@date:2024/3/27 18:19
@desc:
"""
import re
from typing import List

import chardet

from common.handle.base_split_handle import BaseSplitHandle
from common.util.split_model import SplitModel

# Patterns handed to SplitModel when the caller supplies none:
#   1.   a "# " heading at the very start of the text or right after a newline
#   2-6. "## " through "###### " headings that are not preceded by another "#"
#   7.   a run of two or more newlines that is not itself preceded by a newline
default_pattern_list = [
    re.compile(expression)
    for expression in (
        '(?<=^)# .*|(?<=\\n)# .*',
        '(?<!#)## (?!#).*',
        "(?<!#)### (?!#).*",
        "(?<!#)#### (?!#).*",
        "(?<!#)##### (?!#).*",
        "(?<!#)###### (?!#).*",
        "(?<!\n)\n\n+",
    )
]


class TextSplitHandle(BaseSplitHandle):
    """Split handler for plain-text content, decoded via ``chardet``."""

    def support(self, file, get_buffer):
        """Tell whether ``file`` should be treated as text.

        ``.md`` and ``.txt`` names (case-insensitive) are accepted outright.
        Any other file is accepted when ``chardet`` reports an encoding other
        than ``'ascii'`` with confidence above 0.5.

        :param file: uploaded file; ``file.name`` is inspected first.
        :param get_buffer: callable invoked as ``get_buffer(file)``; only
            called when the extension check does not already decide.
        """
        file_name: str = file.name.lower()
        # Cheap name check first so the content is not read and analysed
        # when the extension already settles the answer.
        if file_name.endswith((".md", ".txt")):
            return True
        result = chardet.detect(get_buffer(file))
        # NOTE(review): pure-ASCII content is rejected here; presumably such
        # files are expected to reach this handler through the caller's
        # default fallback - confirm.
        return result['encoding'] != 'ascii' and result['confidence'] > 0.5

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Decode ``file`` with its detected encoding and split the text.

        :param file: uploaded file; ``file.name`` is echoed in the result.
        :param pattern_list: patterns for ``SplitModel``; when ``None`` or
            empty, ``default_pattern_list`` is used instead.
        :param with_filter: forwarded to ``SplitModel``.
        :param limit: forwarded to ``SplitModel``.
        :param get_buffer: callable invoked as ``get_buffer(file)`` returning
            the bytes to decode.
        :return: ``{'name': file.name, 'content': ...}`` where ``content`` is
            the output of ``SplitModel.parse``, or ``[]`` when the bytes
            could not be decoded.
        """
        buffer = get_buffer(file)
        if pattern_list:
            split_model = SplitModel(pattern_list, with_filter, limit)
        else:
            split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        try:
            content = buffer.decode(chardet.detect(buffer)['encoding'])
        except Exception:
            # Best effort: covers undecodable bytes as well as detection
            # yielding no usable encoding. Exception (not BaseException) so
            # KeyboardInterrupt / SystemExit still propagate.
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)
                }
34 changes: 21 additions & 13 deletions apps/dataset/serializers/document_serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
from common.event.common import work_thread_pool
from common.event.listener_manage import ListenerManagement, SyncWebDocumentArgs
from common.exception.app_exception import AppApiException
from common.handle.impl.doc_split_handle import DocSplitHandle
from common.handle.impl.pdf_split_handle import PdfSplitHandle
from common.handle.impl.text_split_handle import TextSplitHandle
from common.mixins.api_mixin import ApiMixin
from common.util.common import post
from common.util.field_message import ErrMessage
Expand Down Expand Up @@ -593,17 +596,22 @@ def batch_delete(self, instance: Dict, with_valid=True):
return True


class FileBufferHandle:
    """Read a file's content once and hand the same bytes to every caller."""

    # Content of the first file read through this handle; None until then.
    buffer = None

    def get_buffer(self, file):
        """Return the cached content, reading ``file`` only on the first call.

        Once a buffer has been stored, later calls return it without
        touching ``file`` again.
        """
        cached = self.buffer
        if cached is not None:
            return cached
        cached = self.buffer = file.read()
        return cached


# Handler used when no registered handler claims a file.
default_split_handle = TextSplitHandle()
# Handlers are probed in list order and the first one whose support()
# returns True wins, so the text handler sits last.
split_handles = [
    DocSplitHandle(),
    PdfSplitHandle(),
    default_split_handle,
]


def file_to_paragraph(file, pattern_list: List, with_filter: bool, limit: int):
    """Split an uploaded file with the first handler that supports it.

    A fresh ``FileBufferHandle`` is created per call, so the handlers probed
    here share a single ``file.read()``.

    :param file: the uploaded file object.
    :param pattern_list: split patterns, passed through to the handler.
    :param with_filter: passed through to the handler.
    :param limit: passed through to the handler.
    :return: whatever the selected handler's ``handle`` returns.
    """
    get_buffer = FileBufferHandle().get_buffer
    for split_handle in split_handles:
        if split_handle.support(file, get_buffer):
            return split_handle.handle(file, pattern_list, with_filter, limit, get_buffer)
    # No handler claimed the file: fall back to the default handler.
    return default_split_handle.handle(file, pattern_list, with_filter, limit, get_buffer)
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ langchain-openai = "^0.0.8"
django-ipware = "^6.0.4"
django-apscheduler = "^0.6.2"
chardet2 = "^2.0.3"
pymupdf = "^1.24.0"
python-docx = "^1.1.0"

[build-system]
requires = ["poetry-core"]
Expand Down
2 changes: 1 addition & 1 deletion ui/src/views/dataset/component/UploadComponent.vue
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
action="#"
:auto-upload="false"
:show-file-list="false"
accept=".txt, .md, .csv, .log"
accept=".txt, .md, .csv, .log, .doc, .docx, .pdf"
:limit="50"
:on-exceed="onExceed"
>
Expand Down