Skip to content

Commit

Permalink
update pdf extraction workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
diptanu committed Feb 4, 2025
1 parent c117c08 commit 1f4debe
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 112 deletions.
27 changes: 6 additions & 21 deletions examples/pdf_document_extraction/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ services:
"indexify-cli",
"executor",
"-f",
"default:Extract_pages_tables_images_pdf_docling:download_pdf:0.1",
"default:Extract_pages_tables_images_pdf_docling:download_pdf",
"--server-addr",
"indexify:8900"
]
Expand All @@ -37,7 +37,7 @@ services:
"indexify-cli",
"executor",
"-f",
"default:Extract_pages_tables_images_pdf_docling:image-embedding-docling:0.1",
"default:Extract_pages_tables_images_pdf_docling:image-embedding-docling",
"--server-addr",
"indexify:8900"
]
Expand All @@ -55,7 +55,7 @@ services:
"indexify-cli",
"executor",
"-f",
"default:Extract_pages_tables_images_pdf_docling:elastic_search_writer:0.1",
"default:Extract_pages_tables_images_pdf_docling:elastic_search_writer",
"--server-addr",
"indexify:8900"
]
Expand All @@ -73,7 +73,7 @@ services:
"indexify-cli",
"executor",
"-f",
"default:Extract_pages_tables_images_pdf_docling:text-embedding-extractor:0.1",
"default:Extract_pages_tables_images_pdf_docling:text-embedding-extractor",
"--server-addr",
"indexify:8900"
]
Expand All @@ -91,7 +91,7 @@ services:
"indexify-cli",
"executor",
"-f",
"default:Extract_pages_tables_images_pdf_docling:chunk_text_docling:0.1",
"default:Extract_pages_tables_images_pdf_docling:chunk_text_docling",
"--server-addr",
"indexify:8900"
]
Expand All @@ -115,21 +115,6 @@ services:
server:
volumes:
- data:/tmp/indexify-blob-storage
default-executor:
image: tensorlake/indexify-executor-default:3.13-0.3.0
command:
[
"indexify-cli",
"executor",
"-f",
"default:Extract_pages_tables_images_pdf_docling:default:0.1",
"--server-addr",
"indexify:8900"
]
networks:
server:
volumes:
- data:/tmp/indexify-blob-storage
pdf-parser-executor:
# Use this for GPU support
image: tensorlake/pdf-blueprint-pdf-parser-gpu:latest
Expand All @@ -139,7 +124,7 @@ services:
"indexify-cli",
"executor",
"-f",
"default:Extract_pages_tables_images_pdf_docling:pdf-parse-docling:0.1",
"default:Extract_pages_tables_images_pdf_docling:pdf-parse-docling",
"--server-addr",
"indexify:8900"
]
Expand Down
7 changes: 5 additions & 2 deletions examples/pdf_document_extraction/elastic_writer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from elastic_transport import ApiError
from elasticsearch import Elasticsearch
from typing import Union
import base64
import uuid
Expand All @@ -17,6 +15,8 @@ class ElasticSearchWriter(TensorlakeCompute):
def __init__(self):
super().__init__()
# Connect to Elasticsearch
from elastic_transport import ApiError
from elasticsearch import Elasticsearch
self._client = Elasticsearch(
hosts=["http://elasticsearch:9200"], # <User Change>: default is service name in the docker compose file.
verify_certs=False,
Expand All @@ -31,6 +31,7 @@ def __init__(self):
self._create_indices_if_not_exists()

def _create_indices_if_not_exists(self):
from elastic_transport import ApiError
# Text index mapping
text_mapping = {
"mappings": {
Expand Down Expand Up @@ -98,6 +99,8 @@ def _create_indices_if_not_exists(self):
raise e

def run(self, input: Union[ImageWithEmbedding, TextChunk]) -> bool:
from elastic_transport import ApiError
from elasticsearch import Elasticsearch
try:
if isinstance(input, ImageWithEmbedding):
# Convert image bytes to base64 for storage
Expand Down
69 changes: 3 additions & 66 deletions examples/pdf_document_extraction/embedding.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,10 @@
import base64
from typing import Any, List

from tensorlake.functions_sdk.functions import TensorlakeCompute, tensorlake_function
from sentence_transformers import SentenceTransformer
from common_objects import ImageWithEmbedding, TextChunk, PDFParserDoclingOutput
from inkwell.api.document import Document
from inkwell.api.page import PageFragmentType
import base64
from images import st_image

@tensorlake_function(image=st_image)
def chunk_text(document: Document) -> List[TextChunk]:
    """Extract text chunks from a parsed document.

    Tables and figures become one chunk each (their OCR-extracted text);
    plain text fragments are accumulated per page and then split with a
    recursive character splitter (chunk_size=1000, overlap=200).

    Args:
        document: Parsed document exposing pages and typed page fragments.

    Returns:
        List of TextChunk objects, each tagged with its source page number.
    """
    # Imported lazily so the module can load where langchain is absent.
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks: List[TextChunk] = []
    for page in document.pages:
        page_text = ""
        for fragment in page.page_fragments:
            # Tables and figures are kept as standalone chunks so their OCR
            # text is indexed as a unit rather than split mid-table.
            if fragment.fragment_type in (
                PageFragmentType.TABLE,
                PageFragmentType.FIGURE,
            ):
                chunks.append(
                    TextChunk(chunk=fragment.content.text, page_number=page.page_number)
                )
            # Plain page text is accumulated and chunked after the full page
            # has been scanned, so splits respect the page boundary.
            elif fragment.fragment_type == PageFragmentType.TEXT:
                page_text += fragment.content.text

        for text in text_splitter.split_text(page_text):
            chunks.append(TextChunk(chunk=text, page_number=page.page_number))
    return chunks


@tensorlake_function(image=st_image)
def chunk_text_docling(document: PDFParserDoclingOutput) -> List[TextChunk]:
"""
Expand Down Expand Up @@ -67,6 +32,7 @@ class TextEmbeddingExtractor(TensorlakeCompute):

def __init__(self):
super().__init__()
from sentence_transformers import SentenceTransformer
self.model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)

def run(self, input: TextChunk) -> TextChunk:
Expand All @@ -75,43 +41,14 @@ def run(self, input: TextChunk) -> TextChunk:
return input


class ImageEmbeddingExtractor(TensorlakeCompute):
    """Embeds figure images found in a document with a CLIP model."""

    name = "image-embedding"
    description = "Extractor class that captures an embedding model"
    image=st_image

    def __init__(self):
        super().__init__()
        # CLIP vision/text model used to embed figure images.
        self.model = SentenceTransformer("clip-ViT-B-32")

    def run(self, document: Document) -> List[ImageWithEmbedding]:
        """Return one ImageWithEmbedding per FIGURE fragment in *document*."""
        import io

        from PIL import Image as PILImage

        results: List[ImageWithEmbedding] = []
        for page in document.pages:
            for fragment in page.page_fragments:
                if fragment.fragment_type != PageFragmentType.FIGURE:
                    continue
                # Fragment images arrive base64-encoded; decode into an
                # in-memory buffer so PIL can open them without touching disk.
                buffer = io.BytesIO(base64.b64decode(fragment.content.image))
                buffer.seek(0)
                vector = self.model.encode(PILImage.open(buffer))
                buffer.seek(0)
                results.append(
                    ImageWithEmbedding(
                        embedding=vector,
                        image_bytes=buffer.getvalue(),
                        page_number=page.page_number,
                    )
                )
        return results


class ImageEmbeddingDoclingExtractor(TensorlakeCompute):
name = "image-embedding-docling"
description = "Extractor class that captures an embedding model"
image=st_image

def __init__(self):
super().__init__()
from sentence_transformers import SentenceTransformer
self.model = SentenceTransformer("clip-ViT-B-32")

def run(self, document: PDFParserDoclingOutput) -> List[ImageWithEmbedding]:
Expand Down
9 changes: 1 addition & 8 deletions examples/pdf_document_extraction/images.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,5 @@
from tensorlake import Image

# Container image for the PDF download step: slim Python base plus httpx.
# Note: the base-image string had a needless f-prefix (no placeholders);
# plain literal is equivalent and clearer.
http_client_image = (
    Image()
    .name("tensorlake/pdf-blueprint-download")
    .base_image("python:3.11-slim-bookworm")
    .run("pip install httpx")
)

chroma_image = (
Image()
.name("tensorlake/blueprints-chromadb")
Expand Down Expand Up @@ -37,7 +30,7 @@
inkwell_image_gpu = (
Image()
.name("tensorlake/pdf-blueprint-pdf-parser-gpu")
.base_image("pytorch/pytorch:2.4.1-cuda11.8-cudnn9-runtime")
.base_image("pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime")
.run("apt update")
.run("apt install -y libgl1-mesa-glx")
.run('pip install docling')
Expand Down
17 changes: 2 additions & 15 deletions examples/pdf_document_extraction/workflow.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,9 @@
from tensorlake import RemoteGraph
from elastic_writer import ElasticSearchWriter
from embedding import chunk_text_docling, ImageEmbeddingDoclingExtractor
from tensorlake import RemoteGraph
from tensorlake.functions_sdk.data_objects import File
from tensorlake.functions_sdk.graph import Graph
from tensorlake.functions_sdk.functions import tensorlake_function
from images import http_client_image


@tensorlake_function(image=http_client_image)
def download_pdf(url: str) -> File:
    """Fetch the PDF at *url* and wrap the raw bytes in a File object.

    Follows HTTP redirects and raises on non-success status codes.
    """
    # Imported lazily: httpx is only installed in the http_client_image.
    import httpx

    response = httpx.get(url=url, follow_redirects=True)
    response.raise_for_status()
    return File(data=response.content, mime_type="application/pdf")


# This graph is the alternate approach.
Expand Down Expand Up @@ -60,11 +48,10 @@ def create_graph() -> Graph:

import common_objects
import images
import elasticsearch # this additional module is needed if you're using the second graph

remote_graph = RemoteGraph.deploy(
graph,
additional_modules=[common_objects, elasticsearch, images],
additional_modules=[common_objects, images],
server_url="http://localhost:8900",
)

Expand Down

0 comments on commit 1f4debe

Please sign in to comment.