Skip to content

Commit

Permalink
feat(machine-learning): add mineru (#21932)
Browse files Browse the repository at this point in the history
  • Loading branch information
hongbo-miao authored Dec 24, 2024
1 parent e134135 commit af52d36
Show file tree
Hide file tree
Showing 8 changed files with 3,592 additions and 0 deletions.
43 changes: 43 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ jobs:
kafka-rust-proto-producer: ${{ steps.filter.outputs.kafka-rust-proto-producer }}
kafka-rust-udp-kafka-bridge: ${{ steps.filter.outputs.kafka-rust-udp-kafka-bridge }}
kafka-rust-zeromq-kafka-bridge: ${{ steps.filter.outputs.kafka-rust-zeromq-kafka-bridge }}
mineru: ${{ steps.filter.outputs.mineru }}
mobile-android: ${{ steps.filter.outputs.mobile-android }}
mobile-ios: ${{ steps.filter.outputs.mobile-ios }}
mobile-react-native: ${{ steps.filter.outputs.mobile-react-native }}
Expand Down Expand Up @@ -370,6 +371,9 @@ jobs:
iads-zeromq-iads-bridge:
- '.github/workflows/test.yml'
- 'data-visualization/iads/iads-rtstation/zeromq-iads-bridge/**'
mineru:
- '.github/workflows/test.yml'
- 'machine-learning/mineru/**'
mobile-android:
- '.github/workflows/test.yml'
- 'mobile/mobile-android/**'
Expand Down Expand Up @@ -1919,6 +1923,45 @@ jobs:
with:
directory: machine-learning/hm-kubeflow/pipelines/classify-mnist

# CI job for machine-learning/mineru; gated on the `mineru` output of detect-changes.
mineru-test:
  name: MinerU | Test
  needs: detect-changes
  if: ${{ needs.detect-changes.outputs.mineru == 'true' }}
  runs-on: ubuntu-24.04
  environment: test
  timeout-minutes: 10
  steps:
    - name: Checkout
      uses: actions/checkout@v4.2.2

    # uv cache is keyed on the project's lock file.
    - name: Install uv
      uses: astral-sh/setup-uv@v5.1.0
      with:
        version: 0.5.11
        enable-cache: true
        cache-dependency-glob: machine-learning/mineru/uv.lock

    # The Python version is taken from the project's pyproject.toml.
    - name: Set up Python
      uses: actions/setup-python@v5.3.0
      with:
        python-version-file: machine-learning/mineru/pyproject.toml

    # Detectron2 is installed from Git without build isolation, after torch
    # (presumably because its build needs torch importable; confirm).
    - name: Install Detectron2
      working-directory: machine-learning/mineru
      run: |
        uv venv
        uv pip install torch
        uv pip install --no-build-isolation git+https://github.com/facebookresearch/detectron2.git

    - name: Install dependencies
      working-directory: machine-learning/mineru
      run: uv sync --dev

    - name: Test
      working-directory: machine-learning/mineru
      run: uv run poe test-coverage

    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v5.1.2
      with:
        directory: machine-learning/mineru

docling-test:
name: Docling | Test
needs: detect-changes
Expand Down
3 changes: 3 additions & 0 deletions .mergify.yml
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,9 @@ pull_request_rules:
- or:
- check-success=Kubeflow (classify-mnist) | Test
- check-skipped=Kubeflow (classify-mnist) | Test
- or:
- check-success=MinerU | Test
- check-skipped=MinerU | Test
- or:
- check-success=Docling | Test
- check-skipped=Docling | Test
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,7 @@ The diagram illustrates the repository's architecture, which is considered overl

- **LlamaIndex** - LLM application framework
- **LangChain** - LLM application framework
- **MinerU** - PDF parsing
- **Docling** - Document parsing and conversion
- **GPT4All** - Local LLM models
- **LiteLLM** - LLM gateway
Expand Down
13 changes: 13 additions & 0 deletions machine-learning/mineru/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# uv: Python toolchain, lock file and dependencies.
# Every target uses a single-colon rule; recipe lines are tab-indented.
uv-install-python:
	uv python install
uv-update-lock-file:
	uv lock
uv-install-dependencies:
	uv sync --dev

# poe tasks (defined under [tool.poe.tasks] in pyproject.toml).
uv-run-dev:
	uv run poe dev
uv-run-test:
	uv run poe test
uv-run-test-coverage:
	uv run poe test-coverage
27 changes: 27 additions & 0 deletions machine-learning/mineru/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Project metadata and runtime dependencies for the MinerU PDF-parsing example.
[project]
name = "mineru"
version = "1.0.0"
requires-python = "~=3.12.0"
dependencies = [
# Resolved from Git; see [tool.uv.sources] below.
"detectron2",
"magic-pdf[full]==0.10.6",
]

# Development-only tooling: task runner and test stack.
[dependency-groups]
dev = [
"poethepoet==0.31.1",
"pytest==8.3.4",
"pytest-cov==6.0.0",
]

[tool.uv]
package = false
prerelease = "allow"

# detectron2 is fetched from its Git repository rather than a package index.
[tool.uv.sources]
detectron2 = { git = "https://github.com/facebookresearch/detectron2.git" }

# Tasks invoked as `uv run poe <task>`.
[tool.poe.tasks]
dev = "python src/main.py"
test = "pytest --verbose --verbose"
test-coverage = "pytest --cov=. --cov-report=xml"
3 changes: 3 additions & 0 deletions machine-learning/mineru/src/dummy_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
class TestDummy:
    """Placeholder test case that checks trivial arithmetic."""

    def test_dummy(self):
        """Assert that 1 + 1 equals 2."""
        total = 1 + 1
        assert total == 2
60 changes: 60 additions & 0 deletions machine-learning/mineru/src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import logging
from pathlib import Path

from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze


def process_pdf(pdf_file_path: Path, output_dir_path: Path) -> None:
    """Parse one PDF with MinerU (magic_pdf) and write every artifact to disk.

    Files written into ``output_dir_path`` (created if missing):
    ``<stem>_model.pdf``, ``<stem>_layout.pdf``, ``<stem>_spans.pdf``,
    ``<stem>.md`` and ``<stem>_content_list.json``, where ``<stem>`` is the
    stem of ``pdf_file_path``. Extracted images are written to the
    ``images`` sub-directory of ``output_dir_path``.

    Args:
        pdf_file_path: Path of the PDF to parse.
        output_dir_path: Directory that receives the generated files.
    """
    pdf_file_stem = pdf_file_path.stem

    # Images get their own sub-directory, and its *name* is the prefix handed
    # to dump_md / dump_content_list. Writing images directly into
    # output_dir_path while passing output_dir_path.name as the prefix would
    # make the Markdown point at "<output>/<output>/<image>".
    image_dir_path = output_dir_path / "images"
    # parents=True also creates output_dir_path itself.
    image_dir_path.mkdir(parents=True, exist_ok=True)
    logging.info(f"Processing PDF: {pdf_file_path}")

    image_writer = FileBasedDataWriter(str(image_dir_path))
    markdown_writer = FileBasedDataWriter(str(output_dir_path))
    image_dir = image_dir_path.name

    # Read PDF file into memory
    reader = FileBasedDataReader("")
    pdf_bytes = reader.read(str(pdf_file_path))
    ds = PymuDocDataset(pdf_bytes)

    # Process PDF based on type
    pdf_type = ds.classify()
    logging.info(f"Processing PDF using {pdf_type} mode")

    if pdf_type == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    else:
        infer_result = ds.apply(doc_analyze, ocr=False)
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    # Generate outputs. The draw_* calls receive full paths, while the dump_*
    # calls receive file names that are passed to markdown_writer.
    model_output = str(output_dir_path / f"{pdf_file_stem}_model.pdf")
    layout_output = str(output_dir_path / f"{pdf_file_stem}_layout.pdf")
    spans_output = str(output_dir_path / f"{pdf_file_stem}_spans.pdf")
    markdown_output = f"{pdf_file_stem}.md"
    content_list_output = f"{pdf_file_stem}_content_list.json"

    infer_result.draw_model(model_output)
    pipe_result.draw_layout(layout_output)
    pipe_result.draw_span(spans_output)
    pipe_result.dump_md(markdown_writer, markdown_output, image_dir)
    pipe_result.dump_content_list(markdown_writer, content_list_output, image_dir)

    logging.info(f"All outputs saved to: {output_dir_path}")
    logging.info("Processing completed successfully")


def main() -> None:
    """Run process_pdf on ``data/file.pdf`` and write results to ``output``."""
    process_pdf(
        pdf_file_path=Path("data/file.pdf"),
        output_dir_path=Path("output"),
    )


# Script entry point: enable INFO-level logging, then run the pipeline.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()
Loading

0 comments on commit af52d36

Please sign in to comment.