Skip to content

feat: 支持上传html格式的文档 #364 #518

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions apps/common/handle/impl/html_split_handle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: html_split_handle.py
@date:2024/5/23 10:58
@desc:
"""
import codecs
import logging
import re
from typing import List

from bs4 import BeautifulSoup
from charset_normalizer import detect
from html2text import html2text

from common.handle.base_split_handle import BaseSplitHandle
from common.util.split_model import SplitModel

# Default split patterns: one compiled regex per Markdown heading level, from
# `#` (index 0) to `######` (index 5). A heading is only matched when it begins
# at the very start of the text or directly after a newline; levels 2-6
# additionally refuse a `#` immediately before or after the marker.
default_pattern_list = list(map(re.compile, (
    r'(?<=^)# .*|(?<=\n)# .*',
    r'(?<=\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*',
    r'(?<=\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*',
    r'(?<=\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*',
    r'(?<=\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*',
    r'(?<=\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*',
)))


def get_encoding(buffer):
    """Work out which text encoding to use for decoding an HTML byte buffer.

    The encoding declared by the document itself is preferred. Both the HTML5
    form ``<meta charset="...">`` and the legacy form
    ``<meta http-equiv="Content-Type" content="text/html; charset=...">`` are
    recognised. A declaration is only accepted when Python has a codec for it,
    so an empty or misspelled charset no longer makes the later ``decode`` fail.
    When the document declares nothing usable, the encoding is guessed with
    ``charset_normalizer.detect``; if that guess is inconclusive, ``'utf-8'``
    is returned instead of ``None``.

    :param buffer: raw bytes of the HTML document
    :return: an encoding name suitable for ``bytes.decode``
    """
    soup = BeautifulSoup(buffer, "html.parser")
    for meta in soup.find_all('meta'):
        attrs = meta.attrs or {}
        declared = attrs.get('charset')
        if not declared and str(attrs.get('http-equiv', '')).strip().lower() == 'content-type':
            # Legacy declaration: the charset is embedded in the `content` value.
            matched = re.search(r'charset\s*=\s*["\']?([\w.:-]+)', str(attrs.get('content', '')),
                                re.IGNORECASE)
            declared = matched.group(1) if matched else None
        if not declared:
            continue
        declared = str(declared).strip()
        try:
            codecs.lookup(declared)
        except LookupError:
            # Unknown or empty codec name: ignore it and keep looking.
            continue
        return declared
    detected = detect(buffer)['encoding']
    # `detect` reports None when it cannot decide; fall back to a sane default
    # so callers always receive something `bytes.decode` accepts.
    return detected if detected is not None else 'utf-8'


class HTMLSplitHandle(BaseSplitHandle):
    """Split handler that converts an uploaded HTML file to Markdown-style text
    with ``html2text`` and cuts it into paragraphs with ``SplitModel``."""

    def support(self, file, get_buffer):
        """Tell whether this handler should process ``file``.

        Only the file extension is considered. Sniffing the byte content for a
        "non-ASCII text encoding" cannot tell HTML apart from any other text
        file, so it made this handler claim unrelated uploads.

        :param file: uploaded file object; only its ``name`` is read
        :param get_buffer: unused, kept for interface compatibility
        :return: True when the file name ends with ``.html`` (case-insensitive)
        """
        file_name: str = file.name.lower()
        return file_name.endswith(".html")

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """Decode the HTML file, convert it to text and split it.

        :param file: uploaded file object
        :param pattern_list: split patterns; ``default_pattern_list`` is used
            when this is None or empty
        :param with_filter: forwarded to ``SplitModel``
        :param limit: forwarded to ``SplitModel``
        :param get_buffer: callable returning the raw bytes of ``file``
        :param save_image: unused by this handler
        :return: ``{'name': <file name>, 'content': <split result>}``; the
            content is an empty list when the file cannot be decoded or
            converted
        """
        buffer = get_buffer(file)

        if pattern_list:
            split_model = SplitModel(pattern_list, with_filter, limit)
        else:
            split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        try:
            encoding = get_encoding(buffer)
            content = buffer.decode(encoding)
            content = html2text(content)
        except Exception:
            # Best effort: one unreadable document must not abort the upload,
            # but the reason is logged instead of being silently dropped.
            # `Exception` (not `BaseException`) lets KeyboardInterrupt and
            # SystemExit propagate.
            logging.getLogger(__name__).exception("Failed to parse html file: %s", file.name)
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)
                }
3 changes: 2 additions & 1 deletion apps/dataset/serializers/document_serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from common.event.listener_manage import ListenerManagement, SyncWebDocumentArgs, UpdateEmbeddingDatasetIdArgs
from common.exception.app_exception import AppApiException
from common.handle.impl.doc_split_handle import DocSplitHandle
from common.handle.impl.html_split_handle import HTMLSplitHandle
from common.handle.impl.pdf_split_handle import PdfSplitHandle
from common.handle.impl.text_split_handle import TextSplitHandle
from common.mixins.api_mixin import ApiMixin
Expand Down Expand Up @@ -772,7 +773,7 @@ def get_buffer(self, file):


# Plain-text handler; it is also the last entry of `split_handles` below.
# NOTE(review): presumably used as the fallback when no other handler matches — confirm against the caller.
default_split_handle = TextSplitHandle()
# NOTE(review): the two assignments below are the before/after lines of the same diff hunk.
# The second one, which places HTMLSplitHandle first, is the post-change version.
split_handles = [DocSplitHandle(), PdfSplitHandle(), default_split_handle]
split_handles = [HTMLSplitHandle(), DocSplitHandle(), PdfSplitHandle(), default_split_handle]


def save_image(image_list):
Expand Down
2 changes: 1 addition & 1 deletion ui/src/utils/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ export function getImgUrl(name: string) {
}
// Whether the file name's extension is on the whitelist of accepted types.
export function isRightType(name: string) {
// NOTE(review): the two `typeList` declarations are the before/after lines of the same diff hunk;
// the second one, which adds 'html', is the post-change version.
const typeList = ['txt', 'pdf', 'docx', 'csv', 'md']
const typeList = ['txt', 'pdf', 'docx', 'csv', 'md', 'html']
// `fileType` is defined outside this excerpt; presumably it returns the extension of `name` — confirm.
return typeList.includes(fileType(name))
}

Expand Down
6 changes: 4 additions & 2 deletions ui/src/views/dataset/component/UploadComponent.vue
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
action="#"
:auto-upload="false"
:show-file-list="false"
accept=".txt, .md, .csv, .log, .docx, .pdf"
accept=".txt, .md, .csv, .log, .docx, .pdf, .html"
:limit="50"
:on-exceed="onExceed"
:on-change="fileHandleChange"
Expand All @@ -31,7 +31,9 @@
<em class="hover" @click.prevent="handlePreview(true)"> 选择文件夹 </em>
</p>
<div class="upload__decoration">
<p>支持格式:TXT、Markdown、PDF、DOCX,每次最多上传50个文件,每个文件不超过 100MB</p>
<p>
支持格式:TXT、Markdown、PDF、DOCX、HTML 每次最多上传50个文件,每个文件不超过 100MB
</p>
<p>若使用【高级分段】建议上传前规范文件的分段标识</p>
</div>
</div>
Expand Down
Loading