-
Notifications
You must be signed in to change notification settings - Fork 85
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Loading status checks…
Merge pull request #159 from enoch3712/128-markitdown-documentloader
128 markitdown documentloader
- Loading branch information
Showing
6 changed files
with
865 additions
and
633 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# MarkItDown Document Loader | ||
|
||
MarkItDown is a versatile document processing library that can handle multiple file formats. ExtractThinker's MarkItDown loader provides a robust interface for text extraction with optional vision mode support. | ||
|
||
## Basic Usage | ||
|
||
Here's how to use the MarkItDown loader: | ||
|
||
```python | ||
from extract_thinker import Extractor | ||
from extract_thinker.document_loader import DocumentLoaderMarkItDown | ||
|
||
# Initialize the loader | ||
loader = DocumentLoaderMarkItDown() | ||
|
||
# Load document content | ||
pages = loader.load("document.pdf") | ||
|
||
# Access text content from first page | ||
text = pages[0]["content"] | ||
|
||
# Enable vision mode for image extraction | ||
loader.set_vision_mode(True) | ||
pages_with_images = loader.load("document.pdf") | ||
|
||
# Access both text and image | ||
text = pages_with_images[0]["content"] | ||
image = pages_with_images[0]["image"] # bytes object | ||
``` | ||
|
||
## Features | ||
|
||
- Multi-format support (PDF, DOC, DOCX, PPT, PPTX, XLS, XLSX, etc.) | ||
- Text extraction from various file types | ||
- Optional vision mode for image extraction | ||
- Page-by-page processing | ||
- Stream-based loading support | ||
- Caching capabilities | ||
- LLM integration support | ||
|
||
## Supported Formats | ||
|
||
- Documents: PDF, DOC, DOCX, PPT, PPTX, XLS, XLSX | ||
- Text: TXT, HTML, XML, JSON | ||
- Images: JPG, JPEG, PNG, BMP, GIF | ||
- Audio: WAV, MP3, M4A | ||
- Others: CSV, TSV, ZIP | ||
|
||
## Best Practices | ||
|
||
1. **Document Processing** | ||
- Use vision mode only when image extraction is needed | ||
- Enable caching for repeated processing | ||
- Handle large documents using stream-based loading | ||
|
||
2. **Performance** | ||
- Configure the cache TTL via the `cache_ttl` constructor argument (300 by default) based on your needs | |
- Monitor memory usage with large files | ||
- Use appropriate file formats for best results | ||
|
||
3. **LLM Integration** | ||
- Pass `llm_client` and `llm_model` to the `DocumentLoaderMarkItDown` constructor when needed | |
- Configure based on your specific use case |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
88 changes: 88 additions & 0 deletions
88
extract_thinker/document_loader/document_loader_markitdown.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
from io import BytesIO | ||
from typing import Any, Dict, List, Union | ||
from operator import attrgetter | ||
from cachetools import cachedmethod | ||
from cachetools.keys import hashkey | ||
import magic | ||
|
||
from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader | ||
from extract_thinker.utils import MIME_TYPE_MAPPING | ||
|
||
try: | ||
from markitdown import MarkItDown | ||
except ImportError: | ||
raise ImportError("MarkItDown library is not installed. Please install it with 'pip install markitdown'.") | ||
|
||
class DocumentLoaderMarkItDown(CachedDocumentLoader):
    """
    Document loader that uses MarkItDown to extract content from various file formats.
    Supports text extraction and optional image/page rendering in vision mode.
    """

    # File extensions this loader declares support for.
    SUPPORTED_FORMATS = [
        "pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx",
        "csv", "tsv", "txt", "html", "xml", "json", "zip",
        "jpg", "jpeg", "png", "bmp", "gif", "wav", "mp3", "m4a"
    ]

    def __init__(self, content: Any = None, cache_ttl: int = 300, llm_client=None, llm_model=None):
        """
        Create the loader and its underlying MarkItDown converter.

        Args:
            content: Forwarded unchanged to the CachedDocumentLoader constructor.
            cache_ttl: Forwarded unchanged to the CachedDocumentLoader constructor.
            llm_client: Passed to MarkItDown as ``llm_client``.
            llm_model: Passed to MarkItDown as ``llm_model``.
        """
        super().__init__(content, cache_ttl)
        self.markitdown = MarkItDown(llm_client=llm_client, llm_model=llm_model)

    # The cache key is the path string (or the full stream bytes) combined with
    # the current vision_mode, so toggling vision mode does not return a stale entry.
    # NOTE(review): `self.cache` and `self.vision_mode` are presumably provided by
    # CachedDocumentLoader -- confirm against the base class.
    @cachedmethod(cache=attrgetter('cache'),
                  key=lambda self, source: hashkey(source if isinstance(source, str) else source.getvalue(), self.vision_mode))
    def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]:
        """
        Load and process content using MarkItDown.

        Returns a list of pages, each containing:
            - content: The text content
            - image: The page/image bytes if vision_mode is True

        Args:
            source: Either a file path or BytesIO stream

        Returns:
            List[Dict[str, Any]]: List of pages with content and optional images

        Raises:
            ValueError: If the source cannot be handled (optionally in vision
                mode), or if any step of the conversion fails. In the latter
                case the original exception is chained as ``__cause__``.
        """
        if not self.can_handle(source):
            raise ValueError(f"Cannot handle source: {source}")

        if self.vision_mode and not self.can_handle_vision(source):
            raise ValueError(f"Cannot handle source in vision mode: {source}")

        try:
            # Extract text content using MarkItDown
            if isinstance(source, str):
                result = self.markitdown.convert(source)
            else:
                # For BytesIO, we need to determine the file type
                source.seek(0)
                try:
                    mime = magic.from_buffer(source.getvalue(), mime=True)
                    # First extension whose MIME entry (single value or list)
                    # matches the sniffed type; fall back to plain text.
                    ext = next(
                        (
                            ext
                            for ext, mime_types in MIME_TYPE_MAPPING.items()
                            if mime in (mime_types if isinstance(mime_types, list) else [mime_types])
                        ),
                        'txt',
                    )
                    result = self.markitdown.convert_stream(source, file_extension=f".{ext}")
                finally:
                    # Rewind even when conversion fails so the caller's stream
                    # is never left at an arbitrary position.
                    source.seek(0)

            text_content = result.text_content

            # Split into pages if supported
            if self.can_handle_paginate(source):
                # Pages are delimited by form feeds; whitespace-only pages are dropped.
                pages = [
                    {"content": page_text.strip()}
                    for page_text in text_content.split("\f")
                    if page_text.strip()
                ]
            else:
                pages = [{"content": text_content.strip()}]

            # Add images in vision mode
            if self.vision_mode:
                images_dict = self.convert_to_images(source)
                # Images are looked up by the page's position in `pages`.
                for idx, page_dict in enumerate(pages):
                    if idx in images_dict:
                        page_dict["image"] = images_dict[idx]

            return pages

        except Exception as e:
            # Keep the ValueError contract callers rely on, but chain the
            # original exception so its traceback is preserved.
            raise ValueError(f"Error processing document with MarkItDown: {str(e)}") from e
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.