feat(machine-learning): add mineru (#21932)

hongbo-miao · Dec 24, 2024 · af52d36 · af52d36
1 parent e134135
commit af52d36
Show file tree

Hide file tree

Showing 8 changed files with 3,592 additions and 0 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -103,6 +103,7 @@ jobs:
       kafka-rust-proto-producer: ${{ steps.filter.outputs.kafka-rust-proto-producer }}
       kafka-rust-udp-kafka-bridge: ${{ steps.filter.outputs.kafka-rust-udp-kafka-bridge }}
       kafka-rust-zeromq-kafka-bridge: ${{ steps.filter.outputs.kafka-rust-zeromq-kafka-bridge }}
+      mineru: ${{ steps.filter.outputs.mineru }}
       mobile-android: ${{ steps.filter.outputs.mobile-android }}
       mobile-ios: ${{ steps.filter.outputs.mobile-ios }}
       mobile-react-native: ${{ steps.filter.outputs.mobile-react-native }}
@@ -370,6 +371,9 @@ jobs:
             iads-zeromq-iads-bridge:
               - '.github/workflows/test.yml'
               - 'data-visualization/iads/iads-rtstation/zeromq-iads-bridge/**'
+            mineru:
+              - '.github/workflows/test.yml'
+              - 'machine-learning/mineru/**'
             mobile-android:
               - '.github/workflows/test.yml'
               - 'mobile/mobile-android/**'
@@ -1919,6 +1923,45 @@ jobs:
         with:
           directory: machine-learning/hm-kubeflow/pipelines/classify-mnist
 
+  # Runs the MinerU project's test suite with coverage.
+  # Only runs when the detect-changes job reports a match for the `mineru` path filter.
+  mineru-test:
+    name: MinerU | Test
+    needs: detect-changes
+    if: ${{ needs.detect-changes.outputs.mineru == 'true' }}
+    runs-on: ubuntu-24.04
+    environment: test
+    timeout-minutes: 10
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4.2.2
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5.1.0
+        with:
+          version: 0.5.11
+          enable-cache: true
+          # The uv cache is keyed on this project's lock file.
+          cache-dependency-glob: machine-learning/mineru/uv.lock
+      - name: Set up Python
+        uses: actions/setup-python@v5.3.0
+        with:
+          # The Python version is read from the project's pyproject.toml.
+          python-version-file: machine-learning/mineru/pyproject.toml
+      # torch is installed first and build isolation is disabled, presumably because
+      # the Detectron2 build needs torch importable at build time (TODO confirm).
+      - name: Install Detectron2
+        working-directory: machine-learning/mineru
+        run: |
+          uv venv
+          uv pip install torch
+          uv pip install --no-build-isolation git+https://github.com/facebookresearch/detectron2.git
+      - name: Install dependencies
+        working-directory: machine-learning/mineru
+        run: |
+          uv sync --dev
+      - name: Test
+        working-directory: machine-learning/mineru
+        run: |
+          uv run poe test-coverage
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5.1.2
+        with:
+          directory: machine-learning/mineru
+
   docling-test:
     name: Docling | Test
     needs: detect-changes

diff --git a/.mergify.yml b/.mergify.yml
@@ -311,6 +311,9 @@ pull_request_rules:
       - or:
           - check-success=Kubeflow (classify-mnist) | Test
           - check-skipped=Kubeflow (classify-mnist) | Test
+      - or:
+          - check-success=MinerU | Test
+          - check-skipped=MinerU | Test
       - or:
           - check-success=Docling | Test
           - check-skipped=Docling | Test

diff --git a/README.md b/README.md
@@ -424,6 +424,7 @@ The diagram illustrates the repository's architecture, which is considered overl
 
 - **LlamaIndex** - LLM application framework
 - **LangChain** - LLM application framework
+- **MinerU** - PDF parsing
 - **Docling** - LLM application framework
 - **GPT4All** - Local LLM models
 - **LiteLLM** - LLM gateway

diff --git a/machine-learning/mineru/Makefile b/machine-learning/mineru/Makefile
@@ -0,0 +1,13 @@
+# Environment setup with uv: Python interpreter, lock file, and dependencies.
+uv-install-python::
+	uv python install
+uv-update-lock-file:
+	uv lock
+uv-install-dependencies:
+	uv sync --dev
+
+# Wrappers around the poe tasks (dev, test, test-coverage).
+uv-run-dev:
+	uv run poe dev
+uv-run-test:
+	uv run poe test
+uv-run-test-coverage:
+	uv run poe test-coverage
diff --git a/machine-learning/mineru/pyproject.toml b/machine-learning/mineru/pyproject.toml
@@ -0,0 +1,27 @@
+[project]
+name = "mineru"
+version = "1.0.0"
+requires-python = "~=3.12.0"
+dependencies = [
+  # Resolved from the Git repository declared under [tool.uv.sources] below.
+  "detectron2",
+  "magic-pdf[full]==0.10.6",
+]
+
+[dependency-groups]
+dev = [
+  "poethepoet==0.31.1",
+  "pytest==8.3.4",
+  "pytest-cov==6.0.0",
+]
+
+[tool.uv]
+# The project itself is not built or installed as a package; only dependencies are synced.
+package = false
+# Pre-release versions are accepted during dependency resolution.
+prerelease = "allow"
+
+[tool.uv.sources]
+detectron2 = { git = "https://github.com/facebookresearch/detectron2.git" }
+
+[tool.poe.tasks]
+dev = "python src/main.py"
+# --verbose is passed twice to raise pytest's verbosity by two levels.
+test = "pytest --verbose --verbose"
+# Writes the coverage report as XML.
+test-coverage = "pytest --cov=. --cov-report=xml"
diff --git a/machine-learning/mineru/src/dummy_test.py b/machine-learning/mineru/src/dummy_test.py
@@ -0,0 +1,3 @@
+# Placeholder test class; presumably present so the test tasks have something to collect.
+class TestDummy:
+    def test_dummy(self):
+        # Trivial arithmetic check; it does not exercise any code from main.py.
+        assert 1 + 1 == 2
diff --git a/machine-learning/mineru/src/main.py b/machine-learning/mineru/src/main.py
@@ -0,0 +1,60 @@
+import logging
+from pathlib import Path
+
+from magic_pdf.config.enums import SupportedPdfParseMethod
+from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter
+from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+
+
+def process_pdf(pdf_file_path: Path, output_dir_path: Path) -> None:
+    pdf_flle_stem = pdf_file_path.stem
+    output_dir_path.mkdir(parents=True, exist_ok=True)
+    logging.info(f"Processing PDF: {pdf_file_path}")
+
+    image_writer = FileBasedDataWriter(str(output_dir_path))
+    markdown_writer = FileBasedDataWriter(str(output_dir_path))
+    image_dir = output_dir_path.name
+
+    # Read PDF file into memory
+    reader1 = FileBasedDataReader("")
+    pdf_bytes = reader1.read(str(pdf_file_path))
+    ds = PymuDocDataset(pdf_bytes)
+
+    # Process PDF based on type
+    pdf_type = ds.classify()
+    logging.info(f"Processing PDF using {pdf_type} mode")
+
+    if pdf_type == SupportedPdfParseMethod.OCR:
+        infer_result = ds.apply(doc_analyze, ocr=True)
+        pipe_result = infer_result.pipe_ocr_mode(image_writer)
+    else:
+        infer_result = ds.apply(doc_analyze, ocr=False)
+        pipe_result = infer_result.pipe_txt_mode(image_writer)
+
+    # Generate outputs
+    model_output = str(output_dir_path / f"{pdf_flle_stem}_model.pdf")
+    layout_output = str(output_dir_path / f"{pdf_flle_stem}_layout.pdf")
+    spans_output = str(output_dir_path / f"{pdf_flle_stem}_spans.pdf")
+    markdown_output = f"{pdf_flle_stem}.md"
+    content_list_output = f"{pdf_flle_stem}_content_list.json"
+
+    infer_result.draw_model(model_output)
+    pipe_result.draw_layout(layout_output)
+    pipe_result.draw_span(spans_output)
+    pipe_result.dump_md(markdown_writer, markdown_output, image_dir)
+    pipe_result.dump_content_list(markdown_writer, content_list_output, image_dir)
+
+    logging.info(f"All outputs saved to: {output_dir_path}")
+    logging.info("Processing completed successfully")
+
+
+def main() -> None:
+    pdf_file_path = Path("data/file.pdf")
+    output_dir_path = Path("output")
+    process_pdf(pdf_file_path, output_dir_path)
+
+
+# Script entry point: configure root logging at INFO level, then run main().
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    main()