Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pull request 20241114 #951

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
14 changes: 12 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ FROM ubuntu:22.04
# Set environment variables to non-interactive to avoid prompts during installation
ENV DEBIAN_FRONTEND=noninteractive

RUN /bin/bash -c "sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list && \
sed -i 's/security.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list"

# Update the package list and install necessary packages
RUN apt-get update && \
apt-get install -y \
Expand Down Expand Up @@ -41,10 +44,17 @@ RUN /bin/bash -c "wget https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.tem
pip3 install -U magic-pdf"

# Download models and update the configuration file
RUN /bin/bash -c "pip3 install modelscope && \
RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
pip3 install modelscope && \
wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py && \
python3 download_models.py && \
sed -i 's|cpu|cuda|g' /root/magic-pdf.json"

# Install the extra dependencies required by the FastAPI gateway
COPY requirements-fastapi.txt /minerugw/requirements-fastapi.txt

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To avoid increasing the size of the image by default, this could be made an optional requirement.


RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
pip3 install -r /minerugw/requirements-fastapi.txt"

# Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
CMD ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
45 changes: 45 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
services:
  # FastAPI gateway built from the Dockerfile in this directory.
  mineru-gw:
    build:
      context: .
    depends_on:
      - redis
    container_name: mineru-gw
    ports:
      - "8910:80"
    volumes:
      - ./services/fastapi/app:/minerugw/app:rw
    # The application connects to Redis at host.docker.internal:6380 (the
    # port published by the redis service below). Plain Docker Engine on
    # Linux does not define that hostname by default, so map it explicitly
    # to the host gateway.
    extra_hosts:
      - "host.docker.internal:host-gateway"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      TZ: Asia/Shanghai
    command: ["/minerugw/app/start.sh"]
    restart: always
    networks:
      - default

  # Redis instance holding parse state and results; published on host port 6380.
  redis:
    image: redis:7.2.4
    container_name: redis
    ports:
      - "6380:6379"
    volumes:
      - ./services/redis/conf/redis.conf:/etc/redis.conf
      # NOTE(review): the configuration directory is also mounted as /data,
      # so Redis data files end up next to redis.conf — confirm this is intended.
      - ./services/redis/conf/:/data/
    restart: always
    entrypoint: ["redis-server", "/etc/redis.conf"]
    environment:
      TZ: Asia/Shanghai
    networks:
      - default

networks:
  default:
    driver: bridge
    ipam:
      driver: default
3 changes: 3 additions & 0 deletions requirements-fastapi.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
fastapi>=0.115.2,<0.115.4
uvicorn>=0.30.0,<0.32.0
redis>=5.2.0
Empty file.
94 changes: 94 additions & 0 deletions services/fastapi/app/magic_pdf_parse_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import os
import json
import datetime
import shutil

from loguru import logger

from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter

from . import redis_util

def pdf_parse(
    md5_value,
    pdf_bytes: bytes,
    parse_method: str = 'auto',
    model_json_path: str = None,
    output_dir: str = None
):
    """
    Parse a PDF and publish the outcome through ``redis_util``.

    The task is processed only when its Redis record exists and is in the
    ``"init"`` state; otherwise the call returns without doing anything.
    On success the content list and the markdown text are stored with
    ``redis_util.set_parse_parsed``; on any failure the record is marked with
    ``redis_util.set_parse_failed`` and the error is logged. Nothing is raised
    to the caller for parsing errors and the function always returns ``None``.

    :param md5_value: key of the task record in Redis (the PDF's MD5 digest)
    :param pdf_bytes: raw bytes of the PDF to parse
    :param parse_method: one of ``auto``, ``ocr`` or ``txt`` (default ``auto``);
        try ``ocr`` if the default gives poor results
    :param model_json_path: path of an existing model-output JSON file that
        matches the PDF; when empty the built-in model is run instead
    :param output_dir: directory under which the temporary, timestamp-named
        working folder is created; defaults to this module's directory
    """
    # Remembered so the ``finally`` clause can remove the scratch folder on
    # every exit path once it may have been created.
    output_path = None
    try:
        file_info = redis_util.get_file_info(md5_value)
        if not file_info or file_info["state"] != "init":
            return

        # Reject unsupported methods before touching the filesystem, so there
        # is no scratch folder to clean up on this path.
        if parse_method not in ("auto", "txt", "ocr"):
            redis_util.set_parse_failed(md5_value)
            logger.error("unknown parse method, only auto, ocr, txt allowed")
            return

        redis_util.set_parse_parsing(md5_value)

        current_script_dir = os.path.dirname(os.path.abspath(__file__))
        foldname = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        output_path = os.path.join(output_dir or current_script_dir, foldname)
        output_image_path = os.path.join(output_path, 'images')

        # Only the directory name is handed to the pipe so that images are
        # referenced by relative path in the markdown and content list.
        image_path_parent = os.path.basename(output_image_path)

        if model_json_path:
            # Raw model output previously produced for this PDF (a list).
            with open(model_json_path, "r", encoding="utf-8") as model_file:
                model_json = json.loads(model_file.read())
        else:
            model_json = []

        image_writer = DiskReaderWriter(output_image_path)

        # Pick the pipeline matching the requested parse method.
        if parse_method == "auto":
            jso_useful_key = {"_pdf_type": "", "model_list": model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        elif parse_method == "txt":
            pipe = TXTPipe(pdf_bytes, model_json, image_writer)
        else:
            pipe = OCRPipe(pdf_bytes, model_json, image_writer)

        # Classify the document.
        pipe.pipe_classify()

        # Without caller-supplied model data, run the built-in model.
        if not model_json:
            pipe.pipe_analyze()

        # Run the actual parse.
        pipe.pipe_parse()

        # Build the structured content list and the markdown text.
        content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")
        md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")

        redis_util.set_parse_parsed(md5_value, content_list, md_content)

    except Exception as e:
        redis_util.set_parse_failed(md5_value)
        logger.exception(e)
    finally:
        # The scratch folder may never have been created on disk (presumably
        # when no image was written), so a missing directory must not turn a
        # finished parse into a failure.
        if output_path is not None:
            shutil.rmtree(output_path, ignore_errors=True)
51 changes: 51 additions & 0 deletions services/fastapi/app/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
'''
Author: dt_4541218930 abcstorms@163.com
Date: 2024-11-14 17:04:42
LastEditors: dt_4541218930 abcstorms@163.com
LastEditTime: 2024-11-15 22:38:00
FilePath: \lzmineru\services\fastapi\app\main.py
Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
'''
from fastapi import FastAPI
import urllib.request
import hashlib
import queue
import threading
from . import magic_pdf_parse_util
from . import redis_util

# Bounded hand-off buffer between the HTTP handler and the parser thread;
# it holds at most 20 pending tasks.
message_queue = queue.Queue(maxsize=20)

# Application object on which the routes of this module are registered.
app = FastAPI()

def calc_md5(byteContent: bytes):
    """Return the hexadecimal MD5 digest of ``byteContent``."""
    return hashlib.md5(byteContent).hexdigest()

def commit_parse_task(md5_value, byteContent: bytes, parse_method):
    """Queue a parse task for the consumer thread without blocking.

    :param md5_value: MD5 digest identifying the PDF
    :param byteContent: raw bytes of the PDF
    :param parse_method: parse method forwarded to the parser
    :raises queue.Full: when the bounded queue has no free slot
    """
    task = {"md5": md5_value, "byteContent": byteContent, "parse_method": parse_method}
    # A blocking put() would never raise on a full queue: the caller's
    # rejection branch could not run and the calling coroutine would stall.
    message_queue.put_nowait(task)

def queue_consumer(q):
    """Run forever, taking tasks from ``q`` and parsing them one at a time.

    :param q: queue of task dicts with the keys ``md5``, ``byteContent`` and
        ``parse_method``
    """
    import logging

    log = logging.getLogger(__name__)
    while True:
        item = q.get()
        if not item:
            continue
        try:
            magic_pdf_parse_util.pdf_parse(item['md5'], item['byteContent'], item['parse_method'])
        except Exception:
            # This is the only consumer: an error escaping pdf_parse must not
            # end the loop, or queued tasks would never be processed again.
            log.exception("parse task failed unexpectedly; consumer keeps running")

# Background worker that drains message_queue. It is a daemon thread because
# its loop never returns; a non-daemon thread would keep the interpreter
# alive after the server has shut down.
consumer_thread = threading.Thread(target=queue_consumer, args=(message_queue,), daemon=True)
consumer_thread.start()

@app.post("/parse_pdf")
async def parse_pdf(imageUrl: str, parse_method: str = 'auto'):
    """Download a PDF, register it for parsing and return its task record.

    The record is keyed by the MD5 digest of the downloaded bytes. If a record
    already exists it is returned as is, so repeated calls for the same file
    poll the state and result of the existing task.

    :param imageUrl: URL of the PDF to download
    :param parse_method: parse method forwarded to the parser (default ``auto``)
    :return: the record stored in Redis for this file
    """
    # NOTE(review): imageUrl is caller-supplied and fetched server-side with
    # no restriction (urllib also opens file: URLs) — consider limiting the
    # allowed schemes/hosts. The download is also a blocking call inside a
    # coroutine.
    with urllib.request.urlopen(imageUrl) as response:
        pdf_bytes = response.read()
    md5_value = calc_md5(pdf_bytes)

    file_info = redis_util.get_file_info(md5_value)
    if file_info:
        return file_info

    # Write the "init" state BEFORE queueing: the consumer skips tasks that
    # have no record, so queueing first could silently drop the task.
    redis_util.set_parse_init(md5_value)
    try:
        commit_parse_task(md5_value, pdf_bytes, parse_method)
    except Exception:
        redis_util.set_parse_deny(md5_value)
    return redis_util.get_file_info(md5_value)
42 changes: 42 additions & 0 deletions services/fastapi/app/redis_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import enum
import json
import os

import redis

class ParseState(enum.Enum):
    """States of a parse task; the member name is what gets stored in Redis."""

    deny = 1
    init = 2
    start = 3
    done = 4
    failed = 5

# Redis client shared by every helper in this module. The defaults keep the
# previous hard-coded target (host.docker.internal:6380, db 0); REDIS_HOST,
# REDIS_PORT and REDIS_DB override them for other deployments.
redis_conn = redis.Redis(
    host=os.environ.get('REDIS_HOST', 'host.docker.internal'),
    port=int(os.environ.get('REDIS_PORT', '6380')),
    db=int(os.environ.get('REDIS_DB', '0')),
)

def get_file_info(md5_value):
    """Return the decoded JSON record stored under ``md5_value``.

    Returns ``None`` when the key is missing or holds an empty value.
    """
    raw = redis_conn.get(md5_value)
    if not raw:
        return None
    return json.loads(raw)

def del_file_info(md5_value):
    """Delete the record stored under ``md5_value``."""
    redis_conn.delete(md5_value)

def set_file_info_expire(md5_value, expire_seconds):
    """Set the time-to-live of the ``md5_value`` key to ``expire_seconds`` seconds."""
    redis_conn.expire(md5_value, expire_seconds)

def set_file_info(md5_value, state: ParseState, content_list = "", md_content = "", expire_seconds = None):
    """Store the task record for ``md5_value`` as a JSON string.

    :param md5_value: Redis key of the record
    :param state: task state; its ``name`` is what gets stored
    :param content_list: parsed content list (empty string while not parsed)
    :param md_content: parsed markdown (empty string while not parsed)
    :param expire_seconds: optional time-to-live, applied in the same SET
        command so a record can never be left behind without an expiry;
        ``None`` stores the record without a time-to-live
    """
    json_str = json.dumps({"state": state.name, "content_list": content_list, "md_content": md_content})
    redis_conn.set(md5_value, json_str, ex=expire_seconds)


def set_parse_deny(md5_value):
    """Mark the task as rejected; the record expires after 5 seconds."""
    set_file_info(md5_value, ParseState.deny, expire_seconds=5)


def set_parse_failed(md5_value):
    """Mark the task as failed; the record expires after 10 seconds."""
    set_file_info(md5_value, ParseState.failed, expire_seconds=10)


def set_parse_init(md5_value):
    """Mark the task as accepted and waiting; the record expires after 1 hour."""
    set_file_info(md5_value, ParseState.init, expire_seconds=60 * 60)


def set_parse_parsing(md5_value):
    """Mark the task as being parsed; the record expires after 30 minutes."""
    set_file_info(md5_value, ParseState.start, expire_seconds=60 * 30)


def set_parse_parsed(md5_value, content_list, md_content):
    """Store the parse result; the record expires after 24 hours."""
    set_file_info(md5_value, ParseState.done, content_list, md_content, expire_seconds=60 * 60 * 24)
5 changes: 5 additions & 0 deletions services/fastapi/app/start.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Start the FastAPI gateway inside the MinerU virtual environment.
echo "starting miner server"
source /opt/mineru_venv/bin/activate
# The app is addressed as app.main:app, so it must be started from /minerugw.
cd /minerugw || exit 1
# exec replaces this shell with uvicorn so that signals sent to the script's
# process reach the server directly instead of stopping at bash.
exec uvicorn app.main:app --host 0.0.0.0 --port 80
28 changes: 28 additions & 0 deletions services/fastapi/app/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
'''
Author: FutureMeng be_loving@163.com
Date: 2024-11-13 19:05:01
LastEditors: FutureMeng be_loving@163.com
LastEditTime: 2024-11-13 19:06:17
FilePath: \lzmineru\api\test.py
Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
'''

import urllib.request
import os
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter

# Manual smoke test: download one PDF, run the UNIPipe pipeline on it and
# print the resulting markdown.
current_script_dir = os.path.dirname(os.path.abspath(__file__))
local_image_dir = os.path.join(current_script_dir, 'images')
# Only the directory name is passed to the pipe, so the markdown refers to
# images by relative path.
image_dir = str(os.path.basename(local_image_dir))
# NOTE(review): the default below is a pre-signed URL with an access key id,
# a signature and an Expires parameter committed to the repository; it
# presumably stops working once it expires. Set MINERU_TEST_PDF_URL to test
# against another document.
imageUrl = os.environ.get(
    'MINERU_TEST_PDF_URL',
    'https://one-jiulu.oss-cn-beijing.aliyuncs.com/9250ba5ccbf34249b054d063d32ec8f8.pdf?OSSAccessKeyId=LTAI5tABhdnCgSeVaptuWLfx&Expires=1732100601&Signature=XZqGPO%2BJ76bEJ0ou8GZQUO7vhjs%3D',
)

# Close the HTTP response as soon as the body has been read.
with urllib.request.urlopen(imageUrl) as response:
    pdf_bytes = response.read()

image_writer = DiskReaderWriter(local_image_dir)
jso_useful_key = {"_pdf_type": "", "model_list": []}
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
pipe.pipe_classify()
pipe.pipe_analyze()
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
print(md_content)
Binary file added services/redis/conf/dump.rdb
Binary file not shown.
Loading
Loading