|
# To run this example, you will need to:
# 1. Set a `MISTRAL_API_KEY` environment variable
# 2. Place a PDF file named `sample.pdf` in the same directory as this script
#
# This example demonstrates OCR document processing with structured annotations,
# embedding the extracted documents using Mistral embeddings, and storing them
# in an InMemoryDocumentStore for later retrieval.
#
# You can customize the ImageAnnotation and DocumentAnnotation schemas below
# to extract different structured information from your documents.
from typing import List

from haystack import Pipeline
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from mistralai.models import DocumentURLChunk
from pydantic import BaseModel, Field

from haystack_integrations.components.converters.mistral.ocr_document_converter import (
    MistralOCRDocumentConverter,
)
from haystack_integrations.components.embedders.mistral.document_embedder import (
    MistralDocumentEmbedder,
)
| 26 | + |
| 27 | + |
# Define schema for structured image annotations (bbox): the OCR converter
# asks the model to fill these fields for every image region it detects.
class ImageAnnotation(BaseModel):
    # Category of the image content (e.g. chart, photo, diagram).
    image_type: str = Field(..., description="The type of image content")
    # Short free-text summary of what the image shows.
    description: str = Field(..., description="Brief description of the image")
| 32 | + |
| 33 | + |
# Define schema for structured document annotations: the OCR converter asks
# the model to fill these fields once per processed document.
class DocumentAnnotation(BaseModel):
    # Primary language the document is written in.
    language: str = Field(..., description="Primary language of the document")
    # Any URLs the model finds in the document body.
    urls: List[str] = Field(..., description="URLs found in the document")
    # High-level subjects the document covers.
    topics: List[str] = Field(..., description="Main topics covered in the document")
| 39 | + |
| 40 | + |
# Initialize document store
document_store = InMemoryDocumentStore()

# Create indexing pipeline: OCR conversion -> embedding -> write to store
indexing_pipeline = Pipeline()

# Add components to the pipeline.
# The converter is limited to the first two pages (0-indexed) of each source.
indexing_pipeline.add_component(
    "converter",
    MistralOCRDocumentConverter(pages=[0, 1]),
)
indexing_pipeline.add_component(
    "embedder",
    MistralDocumentEmbedder(),
)
indexing_pipeline.add_component(
    "writer",
    DocumentWriter(document_store=document_store),
)

# Connect components
indexing_pipeline.connect("converter.documents", "embedder.documents")
indexing_pipeline.connect("embedder.documents", "writer.documents")

# Prepare sources: a remote URL and a local file path are both accepted
sources = [
    DocumentURLChunk(document_url="https://arxiv.org/pdf/1706.03762"),
    "./sample.pdf",  # Local PDF file
]

# Run the pipeline, passing the annotation schemas so the converter can
# request structured output for images (bbox) and whole documents.
result = indexing_pipeline.run(
    {
        "converter": {
            "sources": sources,
            "bbox_annotation_schema": ImageAnnotation,
            "document_annotation_schema": DocumentAnnotation,
        }
    }
)


# Check out documents processed by OCR.
# Optionally with enriched content (from bbox annotation) and semantic metadata
# (from document annotation).
# NOTE(review): `.storage` looks like an internal attribute of
# InMemoryDocumentStore — consider `document_store.filter_documents()` for the
# public API; verify against the installed haystack version.
documents = document_store.storage
# Check out the Mistral API response for unprocessed data and usage_info
raw_mistral_response = result["converter"]["raw_mistral_response"]