Merge pull request #159 from enoch3712/128-markitdown-documentloader

128 markitdown documentloader
enoch3712 · Dec 28, 2024 · cd8e195 · cd8e195
2 parents bded4e1 + 0b8b1e5
commit cd8e195
Showing 6 changed files with 865 additions and 633 deletions.
diff --git a/docs/core-concepts/document-loaders/markitdown.md b/docs/core-concepts/document-loaders/markitdown.md
@@ -0,0 +1,63 @@
+# MarkItDown Document Loader
+
+MarkItDown is a versatile document processing library that can handle multiple file formats. ExtractThinker's MarkItDown loader provides a robust interface for text extraction with optional vision mode support.
+
+## Basic Usage
+
+Here's how to use the MarkItDown loader:
+
+```python
+from extract_thinker import Extractor
+from extract_thinker.document_loader import DocumentLoaderMarkItDown
+
+# Initialize the loader (optional arguments: cache_ttl, llm_client, llm_model)
+loader = DocumentLoaderMarkItDown()
+
+# Load the document; the result is a list of page dictionaries
+pages = loader.load("document.pdf")
+
+# Each page dictionary stores its extracted text under the "content" key
+text = pages[0]["content"]
+
+# Enable vision mode so that loaded pages can also carry an image
+loader.set_vision_mode(True)
+pages_with_images = loader.load("document.pdf")
+
+# Access both text and image
+# (the "image" key is only set when an image was produced for that page)
+text = pages_with_images[0]["content"]
+image = pages_with_images[0]["image"]  # bytes object
+```
+
+## Features
+
+- Multi-format support (PDF, DOC, DOCX, PPT, PPTX, XLS, XLSX, etc.)
+- Text extraction from various file types
+- Optional vision mode for image extraction
+- Page-by-page processing
+- Stream-based loading support
+- Caching capabilities
+- LLM integration support
+
+## Supported Formats
+
+- Documents: PDF, DOC, DOCX, PPT, PPTX, XLS, XLSX
+- Text: TXT, HTML, XML, JSON
+- Images: JPG, JPEG, PNG, BMP, GIF
+- Audio: WAV, MP3, M4A
+- Others: CSV, TSV, ZIP
+
+## Best Practices
+
+1. **Document Processing**
+   - Use vision mode only when image extraction is needed
+   - Enable caching for repeated processing
+   - Handle large documents using stream-based loading
+
+2. **Performance**
+   - Configure the cache TTL (the loader's `cache_ttl` constructor argument, default `300`) based on your needs
+   - Monitor memory usage with large files
+   - Use appropriate file formats for best results
+
+3. **LLM Integration**
+   - Pass `llm_client` and `llm_model` to the `DocumentLoaderMarkItDown` constructor when your use case needs MarkItDown's LLM integration
+   - Configure based on your specific use case
diff --git a/extract_thinker/document_loader/document_loader.py b/extract_thinker/document_loader/document_loader.py
@@ -154,3 +154,28 @@ def can_handle_vision(self, source: Union[str, BytesIO]) -> bool:
             return False
         except Exception:
             return False
+
+    def can_handle_paginate(self, source: Union[str, BytesIO]) -> bool:
+        """
+        Checks if the source supports pagination (e.g., PDF, PPT).
+        
+        Args:
+            source: Either a file path (str) or a BytesIO stream
+            
+        Returns:
+            bool: True if the source supports pagination
+        """
+        try:
+            if isinstance(source, str):
+                # For file paths, check the extension
+                ext = get_file_extension(source).lower()
+            else:
+                # For BytesIO streams, use magic to detect mime type
+                mime = magic.from_buffer(source.getvalue(), mime=True)
+                source.seek(0)  # Reset stream position
+                return mime == 'application/pdf'
+
+            # List of extensions that support pagination
+            return ext in ['pdf']
+        except Exception:
+            return False  
diff --git a/extract_thinker/document_loader/document_loader_markitdown.py b/extract_thinker/document_loader/document_loader_markitdown.py
@@ -0,0 +1,88 @@
+from io import BytesIO
+from typing import Any, Dict, List, Union
+from operator import attrgetter
+from cachetools import cachedmethod
+from cachetools.keys import hashkey
+import magic
+
+from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
+from extract_thinker.utils import MIME_TYPE_MAPPING
+
+try:
+    from markitdown import MarkItDown
+except ImportError:
+    raise ImportError("MarkItDown library is not installed. Please install it with 'pip install markitdown'.")
+
+class DocumentLoaderMarkItDown(CachedDocumentLoader):
+    """
+    Document loader that uses MarkItDown to extract content from various file formats.
+    Supports text extraction and optional image/page rendering in vision mode.
+    """
+
+    SUPPORTED_FORMATS = [
+        "pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx", 
+        "csv", "tsv", "txt", "html", "xml", "json", "zip",
+        "jpg", "jpeg", "png", "bmp", "gif", "wav", "mp3", "m4a"
+    ]
+
+    def __init__(self, content: Any = None, cache_ttl: int = 300, llm_client=None, llm_model=None):
+        super().__init__(content, cache_ttl)
+        self.markitdown = MarkItDown(llm_client=llm_client, llm_model=llm_model)
+
+    @cachedmethod(cache=attrgetter('cache'), 
+                  key=lambda self, source: hashkey(source if isinstance(source, str) else source.getvalue(), self.vision_mode))
+    def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]:
+        """
+        Load and process content using MarkItDown.
+        Returns a list of pages, each containing:
+        - content: The text content
+        - image: The page/image bytes if vision_mode is True
+        
+        Args:
+            source: Either a file path or BytesIO stream
+            
+        Returns:
+            List[Dict[str, Any]]: List of pages with content and optional images
+        """
+        if not self.can_handle(source):
+            raise ValueError(f"Cannot handle source: {source}")
+
+        if self.vision_mode and not self.can_handle_vision(source):
+            raise ValueError(f"Cannot handle source in vision mode: {source}")
+
+        try:
+            # Extract text content using MarkItDown
+            if isinstance(source, str):
+                result = self.markitdown.convert(source)
+            else:
+                # For BytesIO, we need to determine the file type
+                source.seek(0)
+                mime = magic.from_buffer(source.getvalue(), mime=True)
+                ext = next((ext for ext, mime_types in MIME_TYPE_MAPPING.items() 
+                          if mime in (mime_types if isinstance(mime_types, list) else [mime_types])), 'txt')
+                result = self.markitdown.convert_stream(source, file_extension=f".{ext}")
+                source.seek(0)
+
+            text_content = result.text_content
+
+            # Split into pages if supported
+            pages = []
+            if self.can_handle_paginate(source):
+                raw_pages = text_content.split("\f")
+                for page_text in raw_pages:
+                    if page_text.strip():
+                        pages.append({"content": page_text.strip()})
+            else:
+                pages = [{"content": text_content.strip()}]
+
+            # Add images in vision mode
+            if self.vision_mode:
+                images_dict = self.convert_to_images(source)
+                for idx, page_dict in enumerate(pages):
+                    if idx in images_dict:
+                        page_dict["image"] = images_dict[idx]
+
+            return pages
+
+        except Exception as e:
+            raise ValueError(f"Error processing document with MarkItDown: {str(e)}")
diff --git a/extract_thinker/process.py b/extract_thinker/process.py
@@ -221,14 +221,14 @@ def split(self, classifications: List[Classification], strategy: SplittingStrate
             eager_group = self.splitter.split_eager_doc_group(pages, classifications)
             self.doc_groups = eager_group
         else:  # LAZY strategy
-            processed_groups = self.splitter.split_lazy_doc_group(pages, classifications)
-            self.doc_groups = processed_groups.doc_groups
+            if document_loader.can_handle_paginate(self.file_path):
+                processed_groups = self.splitter.split_lazy_doc_group(pages, classifications)
+                self.doc_groups = processed_groups.doc_groups
+            else:
+                raise ValueError("Document Type does not support lazy splitting. for now only pdf is supported")
 
         return self
 
-    def where(self, condition):
-        pass
-
     def extract(self, vision: bool = False) -> List[Any]:
         """Extract information from the document groups."""
         if self.doc_groups is None: