Skip to content

Commit

Permalink
Merge pull request #159 from enoch3712/128-markitdown-documentloader
Browse files Browse the repository at this point in the history
128 markitdown documentloader
  • Loading branch information
enoch3712 authored Dec 28, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
2 parents bded4e1 + 0b8b1e5 commit cd8e195
Showing 6 changed files with 865 additions and 633 deletions.
63 changes: 63 additions & 0 deletions docs/core-concepts/document-loaders/markitdown.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# MarkItDown Document Loader

MarkItDown is a versatile document processing library that can handle multiple file formats. ExtractThinker's MarkItDown loader provides a robust interface for text extraction with optional vision mode support.

## Basic Usage

Here's how to use the MarkItDown loader:

```python
from extract_thinker import Extractor
from extract_thinker.document_loader import DocumentLoaderMarkItDown

# Initialize the loader (cache_ttl, llm_client and llm_model are optional arguments)
loader = DocumentLoaderMarkItDown()

# Load document content; the result is a list of page dictionaries
pages = loader.load("document.pdf")

# Access text content from first page
text = pages[0]["content"]

# Enable vision mode so that loaded pages also carry their image
loader.set_vision_mode(True)
pages_with_images = loader.load("document.pdf")

# Access both text and image (the "image" key is only added in vision mode)
text = pages_with_images[0]["content"]
image = pages_with_images[0]["image"] # bytes object
```

## Features

- Multi-format support (PDF, DOC, DOCX, PPT, PPTX, XLS, XLSX, etc.)
- Text extraction from various file types
- Optional vision mode for image extraction
- Page-by-page processing (currently for PDF sources only; other formats are returned as a single page)
- Stream-based loading support
- Caching capabilities
- LLM integration support

## Supported Formats

- Documents: PDF, DOC, DOCX, PPT, PPTX, XLS, XLSX
- Text: TXT, HTML, XML, JSON
- Images: JPG, JPEG, PNG, BMP, GIF
- Audio: WAV, MP3, M4A
- Others: CSV, TSV, ZIP

## Best Practices

1. **Document Processing**
- Use vision mode only when image extraction is needed
- Enable caching for repeated processing
- Handle large documents using stream-based loading

2. **Performance**
- Configure the cache TTL (the `cache_ttl` constructor argument, default 300) based on your needs
- Monitor memory usage with large files
- Use appropriate file formats for best results

3. **LLM Integration**
- Pass `llm_client` and `llm_model` to the `DocumentLoaderMarkItDown` constructor when needed; both are forwarded to MarkItDown
- Configure based on your specific use case
25 changes: 25 additions & 0 deletions extract_thinker/document_loader/document_loader.py
Original file line number Diff line number Diff line change
@@ -154,3 +154,28 @@ def can_handle_vision(self, source: Union[str, BytesIO]) -> bool:
return False
except Exception:
return False

def can_handle_paginate(self, source: Union[str, BytesIO]) -> bool:
    """
    Report whether the source can be split into pages.

    Only PDF is recognised: file paths are judged by their extension,
    BytesIO streams by the MIME type detected from their bytes.

    Args:
        source: Either a file path (str) or a BytesIO stream
    Returns:
        bool: True if the source is a PDF; False otherwise, including
            when detection itself raises an exception
    """
    # Extensions (lower-case) that are treated as paginated.
    paginated_extensions = ['pdf']
    try:
        if not isinstance(source, str):
            # Streams carry no file name, so sniff the MIME type from the bytes.
            detected_mime = magic.from_buffer(source.getvalue(), mime=True)
            source.seek(0)  # Leave the stream rewound for the next reader
            return detected_mime == 'application/pdf'

        # File paths are classified purely by extension.
        extension = get_file_extension(source).lower()
        return extension in paginated_extensions
    except Exception:
        # Any failure while inspecting the source means "not paginated".
        return False
88 changes: 88 additions & 0 deletions extract_thinker/document_loader/document_loader_markitdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from io import BytesIO
from typing import Any, Dict, List, Union
from operator import attrgetter
from cachetools import cachedmethod
from cachetools.keys import hashkey
import magic

from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
from extract_thinker.utils import MIME_TYPE_MAPPING

try:
from markitdown import MarkItDown
except ImportError:
raise ImportError("MarkItDown library is not installed. Please install it with 'pip install markitdown'.")

class DocumentLoaderMarkItDown(CachedDocumentLoader):
    """
    Document loader that uses MarkItDown to extract content from various file formats.
    Supports text extraction and optional image/page rendering in vision mode.
    """

    # File extensions this loader declares support for.
    SUPPORTED_FORMATS = [
        "pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx",
        "csv", "tsv", "txt", "html", "xml", "json", "zip",
        "jpg", "jpeg", "png", "bmp", "gif", "wav", "mp3", "m4a"
    ]

    def __init__(self, content: Any = None, cache_ttl: int = 300, llm_client=None, llm_model=None):
        """
        Create the loader and its underlying MarkItDown converter.

        Args:
            content: Initial content, forwarded to the base loader
            cache_ttl: Cache TTL, forwarded to the base loader
            llm_client: Optional LLM client, forwarded to MarkItDown
            llm_model: Optional LLM model, forwarded to MarkItDown
        """
        super().__init__(content, cache_ttl)
        self.markitdown = MarkItDown(llm_client=llm_client, llm_model=llm_model)

    # The cache key combines the source (path string, or the stream's bytes)
    # with the vision mode, so text-only and vision results are cached apart.
    @cachedmethod(cache=attrgetter('cache'),
                  key=lambda self, source: hashkey(source if isinstance(source, str) else source.getvalue(), self.vision_mode))
    def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]:
        """
        Load and process content using MarkItDown.

        Returns a list of pages, each containing:
            - content: The text content
            - image: The page/image bytes if vision_mode is True

        Paginated sources are split on form feeds and blank pages are
        omitted; every other source is returned as a single page.

        Args:
            source: Either a file path or BytesIO stream
        Returns:
            List[Dict[str, Any]]: List of pages with content and optional images
        Raises:
            ValueError: If the source cannot be handled (optionally in vision
                mode), or if any step of the conversion fails; in the latter
                case the original exception is chained as the cause
        """
        if not self.can_handle(source):
            raise ValueError(f"Cannot handle source: {source}")

        if self.vision_mode and not self.can_handle_vision(source):
            raise ValueError(f"Cannot handle source in vision mode: {source}")

        try:
            # Extract text content using MarkItDown
            if isinstance(source, str):
                result = self.markitdown.convert(source)
            else:
                # For BytesIO, we need to determine the file type; fall back
                # to 'txt' when the detected MIME type is not in the mapping.
                source.seek(0)
                mime = magic.from_buffer(source.getvalue(), mime=True)
                ext = next(
                    (ext for ext, mime_types in MIME_TYPE_MAPPING.items()
                     if mime in (mime_types if isinstance(mime_types, list) else [mime_types])),
                    'txt',
                )
                result = self.markitdown.convert_stream(source, file_extension=f".{ext}")
                source.seek(0)

            text_content = result.text_content

            # Keep each page paired with its ORIGINAL page index: blank pages
            # are dropped from the output, and without the original index the
            # images of all following pages would be attached one page off.
            indexed_pages = []
            if self.can_handle_paginate(source):
                for page_index, page_text in enumerate(text_content.split("\f")):
                    stripped_text = page_text.strip()
                    if stripped_text:
                        indexed_pages.append((page_index, {"content": stripped_text}))
            else:
                indexed_pages.append((0, {"content": text_content.strip()}))

            # Add images in vision mode, looked up by original page index
            if self.vision_mode:
                images_dict = self.convert_to_images(source)
                for page_index, page_dict in indexed_pages:
                    if page_index in images_dict:
                        page_dict["image"] = images_dict[page_index]

            return [page_dict for _, page_dict in indexed_pages]

        except Exception as e:
            # Chain the cause so the underlying failure stays visible in tracebacks.
            raise ValueError(f"Error processing document with MarkItDown: {str(e)}") from e
10 changes: 5 additions & 5 deletions extract_thinker/process.py
Original file line number Diff line number Diff line change
@@ -221,14 +221,14 @@ def split(self, classifications: List[Classification], strategy: SplittingStrate
eager_group = self.splitter.split_eager_doc_group(pages, classifications)
self.doc_groups = eager_group
else: # LAZY strategy
processed_groups = self.splitter.split_lazy_doc_group(pages, classifications)
self.doc_groups = processed_groups.doc_groups
if document_loader.can_handle_paginate(self.file_path):
processed_groups = self.splitter.split_lazy_doc_group(pages, classifications)
self.doc_groups = processed_groups.doc_groups
else:
raise ValueError("Document Type does not support lazy splitting. for now only pdf is supported")

return self

def where(self, condition):
    """Unimplemented placeholder: `condition` is ignored and None is returned."""
    return None

def extract(self, vision: bool = False) -> List[Any]:
"""Extract information from the document groups."""
if self.doc_groups is None:
Loading

0 comments on commit cd8e195

Please sign in to comment.