Merge pull request #44 from ucbepic/shreyashankar/paddleocr

feat: add paddleocr
ucbepic · Oct 2, 2024 · 5a78b84 · 5a78b84
2 parents 40846ff + 4fc27cb
commit 5a78b84
Show file tree

Hide file tree

Showing 6 changed files with 1,298 additions and 46 deletions.
diff --git a/docetl/parsing_tools.py b/docetl/parsing_tools.py
@@ -312,6 +312,71 @@ def azure_di_read(
         ]
 
 
+def paddleocr_pdf_to_string(
+    input_path: str,
+    doc_per_page: bool = False,
+    ocr_enabled: bool = True,
+    lang: str = "en",
+) -> List[str]:
+    """
+    Extract text and image information from a PDF file using PaddleOCR for image-based PDFs.
+
+    **Note: this is very slow!!**
+
+    Args:
+        input_path (str): Path to the input PDF file.
+        doc_per_page (bool): If True, return a list of strings, one per page.
+            If False, return a single string.
+        ocr_enabled (bool): Whether to enable OCR for image-based PDFs.
+        lang (str): Language of the PDF file.
+
+    Returns:
+        List[str]: Extracted content as a list of formatted strings.
+    """
+    from paddleocr import PaddleOCR
+    import fitz
+    import numpy as np
+
+    ocr = PaddleOCR(use_angle_cls=True, lang=lang)
+
+    pdf_content = []
+
+    with fitz.open(input_path) as pdf:
+        for page_num in range(len(pdf)):
+            page = pdf[page_num]
+            text = page.get_text()
+            images = []
+
+            # Extract image information
+            for img_index, img in enumerate(page.get_images(full=True)):
+                rect = page.get_image_bbox(img)
+                images.append(f"Image {img_index + 1}: bbox {rect}")
+
+            page_content = f"Page {page_num + 1}:\n"
+            page_content += f"Text:\n{text}\n"
+            page_content += "Images:\n" + "\n".join(images) + "\n"
+
+            if not text and ocr_enabled:
+                mat = fitz.Matrix(2, 2)
+                pix = page.get_pixmap(matrix=mat)
+                img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
+                    pix.height, pix.width, 3
+                )
+
+                ocr_result = ocr.ocr(img, cls=True)
+                page_content += "OCR Results:\n"
+                for line in ocr_result[0]:
+                    bbox, (text, _) = line
+                    page_content += f"{bbox}, {text}\n"
+
+            pdf_content.append(page_content)
+
+    if not doc_per_page:
+        return ["\n\n".join(pdf_content)]
+
+    return pdf_content
+
+
 # Define a dictionary mapping function names to their corresponding functions
 PARSING_TOOLS = {
     "whisper_speech_to_text": whisper_speech_to_text,
@@ -320,4 +385,5 @@ def azure_di_read(
     "docx_to_string": docx_to_string,
     "pptx_to_string": pptx_to_string,
     "azure_di_read": azure_di_read,
+    "paddleocr_pdf_to_string": paddleocr_pdf_to_string,
 }
diff --git a/docs/examples/custom-parsing.md b/docs/examples/custom-parsing.md
@@ -33,16 +33,35 @@ To use custom parsing, you need to define parsing tools in your DocETL configura
 
 ```yaml
 parsing_tools:
-  - name: ocr_parser
+  - name: top_products_report
     function_code: |
-      import pytesseract
-      from pdf2image import convert_from_path
-      def ocr_parser(filename: str) -> List[str]:
-          images = convert_from_path(filename)
-          text = ""
-          for image in images:
-              text += pytesseract.image_to_string(image)
-          return [text]
+      def top_products_report(filename: str) -> List[str]:
+          import pandas as pd
+          
+          # Read the Excel file
+          df = pd.read_excel(filename)
+          
+          # Calculate total sales
+          total_sales = df['Sales'].sum()
+          
+          # Find top 500 products by sales
+          top_products = df.groupby('Product')['Sales'].sum().nlargest(500)
+          
+          # Calculate month-over-month growth
+          df['Date'] = pd.to_datetime(df['Date'])
+          monthly_sales = df.groupby(df['Date'].dt.to_period('M'))['Sales'].sum()
+          mom_growth = monthly_sales.pct_change().fillna(0)
+          
+          # Prepare the analysis report
+          report = [
+              f"Total Sales: ${total_sales:,.2f}",
+              "\nTop 500 Products by Sales:",
+              top_products.to_string(),
+              "\nMonth-over-Month Growth:",
+              mom_growth.to_string()
+          ]
+          
+          return ["\n".join(report)]
 
 datasets:
   sales_reports:
@@ -51,25 +70,26 @@ datasets:
     path: "sales_data/sales_paths.json"
     parsing:
       - input_key: excel_path
-        function: xlsx_to_string
-        output_key: sales_data
-        function_kwargs:
-          orientation: "col"
+        function: top_products_report
+        output_key: sales_analysis
 
   receipts:
     type: file
     source: local
     path: "receipts/receipt_paths.json"
     parsing:
       - input_key: pdf_path
-        function: ocr_parser
+        function: paddleocr_pdf_to_string
         output_key: receipt_text
+        function_kwargs:
+          ocr_enabled: true
+          lang: "en"
 ```
 
 In this configuration:
 
-- We define a custom `ocr_parser` for PDF files.
-- We use the built-in `xlsx_to_string` parser for Excel files.
+- We define a custom `top_products_report` function for Excel files.
+- We use the built-in `paddleocr_pdf_to_string` parser for PDF files.
 - We apply these parsing tools to the external files referenced in the respective datasets.
 
 ### 2. Pipeline Integration
@@ -145,48 +165,55 @@ When you run your DocETL pipeline, the parsing tools you've specified in your co
 
 Let's look at how this works for our earlier examples:
 
-### Excel Files (using xlsx_to_string)
+### Excel Files (using top_products_report)
 
 For an Excel file like "sales_data/january_sales.xlsx":
 
-1. The `xlsx_to_string` function reads the Excel file.
-2. It converts the data to a string representation.
-3. The output might look like this:
+- The top_products_report function reads the Excel file.
+- It processes the sales data and generates a report of top-selling products.
+- The output might look like this:
 
-```
-Date:
-2023-01-01
-2023-01-02
-...
-
-Product:
-Widget A
-Widget B
-...
-
-Amount:
-100
-150
-...
+```markdown
+Top Products Report - January 2023
+
+1. Widget A - 1500 units sold
+2. Gadget B - 1200 units sold
+3. Gizmo C - 950 units sold
+4. Doohickey D - 800 units sold
+5. Thingamajig E - 650 units sold
+   ...
+
+Total Revenue: $245,000
+Best Selling Category: Electronics
 ```
 
-### PDF Files (using ocr_parser)
+### PDF Files (using paddleocr_pdf_to_string)
 
 For a PDF file like "receipts/receipt001.pdf":
 
-1. The `ocr_parser` function converts each page of the PDF to an image.
-2. It applies OCR to each image.
-3. The function combines the text from all pages.
-4. The output might look like this:
+- The paddleocr_pdf_to_string function reads the PDF file.
+- It uses PaddleOCR to perform optical character recognition on each page.
+- The function combines the extracted text from all pages into a single string.
+  The output might look like this:
 
-```
+```markdown
 RECEIPT
 Store: Example Store
 Date: 2023-05-15
 Items:
+
 1. Product A - $10.99
 2. Product B - $15.50
-Total: $26.49
+3. Product C - $7.25
+4. Product D - $22.00
+   Subtotal: $55.74
+   Tax (8%): $4.46
+   Total: $60.20
+
+Payment Method: Credit Card
+Card Number: \***\* \*\*** \*\*\*\* 1234
+
+Thank you for your purchase!
 ```
 
 This parsed and formatted data is then passed to the respective operations in your pipeline for further processing.
@@ -242,13 +269,17 @@ DocETL provides several built-in parsing tools to handle common file formats and
     options:
         show_root_heading: true
         heading_level: 3
-
 
 ::: docetl.parsing_tools.azure_di_read
     options:
         heading_level: 3
         show_root_heading: true
 
+::: docetl.parsing_tools.paddleocr_pdf_to_string
+    options:
+        heading_level: 3
+        show_root_heading: true
+
 ### Using Function Arguments with Parsing Tools
 
 When using parsing tools in your DocETL configuration, you can pass additional arguments to the parsing functions using the function_kwargs field. This allows you to customize the behavior of the parsing tools without modifying their implementation.