diff --git a/ai/generative-ai-service/multi-modal-document-extraction/README.md b/ai/generative-ai-service/multi-modal-document-extraction/README.md
new file mode 100644
index 000000000..56f86f328
--- /dev/null
+++ b/ai/generative-ai-service/multi-modal-document-extraction/README.md
@@ -0,0 +1,41 @@
+# Multi-modal Document Extraction
+
+*This Generative AI service application relies on OCI SDK alongside the new Llama 4 models (Scout and Maverick) to extract data from PDFs (or images) into structured data as JSON.*
+
+Reviewed: 20.05.2025
+
+# When to use this asset?
+
+Developers, data scientists, or ML engineers who need to extract structured JSON from invoices or other document images and want to compare the performance of the new Llama 4 OCI vision models.
+
+# How to use this asset?
+
+1. Open the Streamlit app
+2. Upload a PDF or image file
+3. In the sidebar, select either **meta.llama-4-scout-17b-16e-instruct** or **meta.llama-4-maverick-17b-128e-instruct-fp8**
+4. Wait for processing—JSON output will be displayed when finished
+
+# Setup
+
+To get started, clone the repository, install dependencies, and launch the app:
+
+```bash
+git clone <repository-url>
+cd ai/generative-ai-service/multi-modal-document-extraction
+pip install -r requirements.txt
+streamlit run llama_scout.py
+```
+
+# Useful Links (Optional)
+
+* [More information on Llama 4](https://confluence.oraclecorp.com/confluence/display/EMEACSS/FAQ+for+Generative+AI+Service)
+
+* [Pretrained Foundational Models in Generative AI](https://docs.oracle.com/en-us/iaas/Content/generative-ai/pretrained-models.htm)
+
+# License
+
+Copyright (c) 2025 Oracle and/or its affiliates.
+
+Licensed under the Universal Permissive License (UPL), Version 1.0.
+
+See [LICENSE](https://github.com/oracle-devrel/technology-engineering/blob/main/LICENSE) for more details.
diff --git a/ai/generative-ai-service/multi-modal-document-extraction/config.py b/ai/generative-ai-service/multi-modal-document-extraction/config.py
new file mode 100644
index 000000000..b30228b99
--- /dev/null
+++ b/ai/generative-ai-service/multi-modal-document-extraction/config.py
@@ -0,0 +1,8 @@
+"""
+config
+"""
+
+
+compartment_id = "ocid1.compartment.oc1..aaaaaaaaoi33ny4fvy2nxlrbkn5l2t6sw6yuy5tats7iipnb5hz6jmylqqnq"  # NOTE(review): avoid committing compartment OCIDs; read from an env var or OCI config instead
+service_endpoint = "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com"
+vision_models = ["meta.llama-4-scout-17b-16e-instruct", "meta.llama-4-maverick-17b-128e-instruct-fp8"]
\ No newline at end of file
diff --git a/ai/generative-ai-service/multi-modal-document-extraction/llama_scout.py b/ai/generative-ai-service/multi-modal-document-extraction/llama_scout.py
new file mode 100644
index 000000000..44abdb249
--- /dev/null
+++ b/ai/generative-ai-service/multi-modal-document-extraction/llama_scout.py
@@ -0,0 +1,122 @@
+"""
+Simple streamlit UI for comparison of vision models
+
+Converts PDF to image -> Extracts all data into a JSON
+
+Available models:
+ - Llama 4 Scout
+ - Llama 4 Maverick
+
+Author - Ali Ottoman
+"""
+
+import io
+import base64
+import oci
+from pdf2image import convert_from_bytes
+import streamlit as st
+from oci_models import get_llm
+from prompt import OVERALL_PROMPT
+from config import compartment_id, vision_models
+
+
+# ─── LLM Creation ─────────────────────────────────────────────────────────────
+llm_client = get_llm()
+
+# ─── Helper Functions ─────────────────────────────────────────────────────────
+def save_images(images, output_format="JPEG"):
+    """
+    Serialize PIL images into in-memory JPEG buffers (BytesIO) for encoding
+    """
+    image_list = []
+    for image in images:
+        img_byte_arr = io.BytesIO()
+        image.save(img_byte_arr, format=output_format)
+        img_byte_arr.seek(0)
+        image_list.append(img_byte_arr)
+    return image_list
+
+def encode_image(image_path):
+    """
+    Encodes an image to base64 format.
+ """ + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + +def make_user_message(prompt: str, b64img: str): + """ + Builds UserMessage with text + image + """ + # Text part + txt = oci.generative_ai_inference.models.TextContent() + txt.text = prompt + + # Image part + img = oci.generative_ai_inference.models.ImageContent() + url = oci.generative_ai_inference.models.ImageUrl() + url.url = f"data:image/jpeg;base64,{b64img}" + img.image_url = url + + msg = oci.generative_ai_inference.models.UserMessage() + msg.content = [txt, img] + return msg + +def call_vision_model(frame, prompt: str, vision_model: str): + """ + Assemble and send the chat request + """ + user_msg = make_user_message(prompt, frame) + + # GenericChatRequest + chat_req = oci.generative_ai_inference.models.GenericChatRequest( + messages = [user_msg], + api_format = oci.generative_ai_inference.models.BaseChatRequest.API_FORMAT_GENERIC, + num_generations = 1, + is_stream = False, + temperature = 0.5, + top_p = 0.7, + top_k = -1, + frequency_penalty = 1.0 + ) + + details = oci.generative_ai_inference.models.ChatDetails( + serving_mode = oci.generative_ai_inference.models.OnDemandServingMode(model_id=vision_model), + compartment_id = compartment_id, + chat_request = chat_req + ) + + # Invoke the model + resp = llm_client.chat(details) + return resp.data.chat_response.choices[0].message.content[0].text + +# ─── Main Function ───────────────────────────────────────────────────────────── +def main(): + """ + Streamlit UI and model selection + Running & outputting the JSON + """ + st.title("Model Comparison") + + uploaded_image = st.file_uploader("Upload image here") + + prompt = OVERALL_PROMPT + + with st.sidebar: + st.subheader("Select your model for comparison") + vision_model = st.selectbox("Choose your model:", vision_models) + if uploaded_image is not None: + with st.spinner("Processing..."): + if uploaded_image.type == "application/pdf": + images = 
convert_from_bytes(uploaded_image.read(), fmt="jpeg")
+                    image_list = save_images(images)
+                    encoded_frame = base64.b64encode(image_list[0].getvalue()).decode("utf-8")
+                else:
+                    # Non-PDF uploads are already image bytes; encode them directly.
+                    # (convert_from_bytes only accepts PDF data and would fail on images.)
+                    encoded_frame = base64.b64encode(uploaded_image.read()).decode("utf-8")
+
+                result = call_vision_model(encoded_frame, prompt, vision_model)
+                st.write(result)
+
+# ────────────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    main()
diff --git a/ai/generative-ai-service/multi-modal-document-extraction/oci_models.py b/ai/generative-ai-service/multi-modal-document-extraction/oci_models.py
new file mode 100644
index 000000000..b4d839e1e
--- /dev/null
+++ b/ai/generative-ai-service/multi-modal-document-extraction/oci_models.py
@@ -0,0 +1,29 @@
+"""
+ This module provides a function to initialize LLM
+
+ Return an instance of the OCI GenAI inference client.
+
+ Author: Ali Ottoman
"""
+# ─── Imports ──────────────────────────────────────────────────────────────────
+import oci
+from config import service_endpoint
+
+
+# ─── Configuration ────────────────────────────────────────────────────────────
+config = oci.config.from_file("~/.oci/config", "DEFAULT")
+
+def get_llm():
+    """
+    Initialize and return a GenerativeAiInferenceClient with the specified configuration.
+
+    Returns:
+        GenerativeAiInferenceClient: An instance of the OCI GenAI inference client.
+    """
+    llm = oci.generative_ai_inference.GenerativeAiInferenceClient(
+        config=config,
+        service_endpoint=service_endpoint,
+        retry_strategy=oci.retry.NoneRetryStrategy(),
+        timeout=(10, 240)
+    )
+    return llm
diff --git a/ai/generative-ai-service/multi-modal-document-extraction/prompt.py b/ai/generative-ai-service/multi-modal-document-extraction/prompt.py
new file mode 100644
index 000000000..3793439c4
--- /dev/null
+++ b/ai/generative-ai-service/multi-modal-document-extraction/prompt.py
@@ -0,0 +1,150 @@
+"""
+    This file contains various prompt templates for different invoice parsing tasks.
+""" + +OVERALL_PROMPT = """ + You are a high-precision invoice parser. + When given an image of an invoice, you will: + + 1. Detect all section headers on the invoice. + - A header is any line in larger or bold font, or followed by a blank line, colon, or underline. + + 2. Extract the content under each header until the next header or end of document. + - Key–Value blocks: single lines or small blocks → JSON properties. + - Tables: first row as column headers (snake_case) → array of objects. + - Multi-line notes: join lines with spaces. + + 3. For monetary fields, strip symbols/codes and output two properties: + - (number) + - _currency (string) + + 4. General rules: + - DO NOT output anything other than the valid JSON—no markdown, NO extra text. + - Use null for missing values. + - Dates must be ISO 8601 (YYYY-MM-DD). + + Example: + { + "company_info": { + "name": "Oman Insurance Management Services Ltd.", + "address": "Unit 407, Level 4, Gate District 03, DIFC, Dubai, United Arab Emirates", + "reference": "KFM97956124-E6", + "date": "2024-11-29" + }, + "attention_to": null, + "credit_note": "Endorsement #6 HANMIR", + "reinsured": { + "name": "Hanwha General Insurance Co., Ltd. 
(Korean Reinsurance Company)"
+      },
+      "original_insured": "KOREA INSTITUTE OF MARITIME AND FISHERIES TECHNOLOGY (OWNER & MANAGER)",
+      "insurance_covers": "Hull Facultative Reinsurance",
+      "policy_no": null,
+      "insurance_period": "One year as from 2024-04-01",
+      "Line Items":
+      {
+        "description": "Premium",
+        "amount": 12345.67,
+        "amount_currency": "KRW"
+      },
+      "Order Hereon": {
+        "percentage": "7.5%",
+        "amount": 131797,
+        "amount_currency": "KRW"
+      }
+
+      // …additional sections if present
+    }
+    """
+
+GENERIC_PROMPT = """
+    Extract the following details and provide the response only in valid JSON format (no extra explanation or text):
+    - **Debit / Credit Note No.**
+    - **Policy Period**
+    - **Insured**
+    - **Vessel Name**
+    - **Details**
+    - **Currency**
+    - **Gross Premium 100%**
+    - **OIMSL Share**
+    - **Total Deductions**
+    - **Net Premium**
+    - **Premium Schedule**
+    - **Installment Amount**
+
+    Ensure the extracted data is formatted correctly as JSON and include nothing else at all in the response, not even a greeting or closing.
+
+    For example:
+
+    "Debit / Credit Note No.": "296969",
+    "Policy Period": "Feb 20, 2024 to Jul 15, 2025",
+    "Insured": "Stealth Maritime Corp. S.A.",
+    "Vessel Name": "SUPRA DUKE - HULL & MACHINERY", (Make sure this is the entire vessel name only)
+    "Details": "SUPRA DUKE - Original Premium",
+    "Currency": "USD",
+    "Gross Premium 100%": 56973.63,
+    "OIMSL Share": 4557.89,
+    "Total Deductions": 979.92,
+    "Net Premium": 3577.97,
+    "Premium Schedule": ["Apr 20, 2024", "Jun 14, 2024", "Sep 13, 2024", "Dec 14, 2024", "Mar 16, 2025", "Jun 14, 2025"],
+    "Installment Amount": [372.87, 641.02, 641.02, 641.02, 641.02, 641.02]
+
+    Ensure that no other content whatsoever appears in the response except the JSON.
+    """
+
+NHS_PROMPT = """
+    You are a high-precision invoice parser.
+ When given an invoice (image, PDF, or text), produce **one** valid JSON object with exactly the following fields, in this order: + + 1. invoice_number (string) + 2. account_reference (string) + 3. issue_date (ISO 8601 date: YYYY-MM-DD) + 4. due_date (ISO 8601 date: YYYY-MM-DD) + 5. supplier_name (string) + 6. supplier_address (string) + 7. VAT_registration_number (string) + 8. total_amount (number) + 9. currency (string) + 10. vat_amount (number) + 11. line_items (array of objects), each with: + - description (string) + - quantity (string) + - unit_price (number) + - total (number) + + **Rules:** + - **Output only** the JSON—no markdown, no extra text. + - Use `null` for any missing values. + - Dates **must** be in ISO 8601 (YYYY-MM-DD). + - Numeric fields must omit symbols and separators (e.g. `1500.0`, not “$1,500”). + - Preserve the array structure for `line_items` even if empty. + + **Example:** + ```json + { + "invoice_number": "INV-1001", + "account_reference": "AR-2024", + "issue_date": "2024-05-18", + "due_date": "2024-06-18", + "supplier_name": "Acme Corporation", + "supplier_address": "123 Main St, Anytown, Country", + "VAT_registration_number": "GB123456789", + "total_amount": 1500.0, + "currency": "GBP", + "vat_amount": 300.0, + "line_items": [ + { + "description": "Widget A", + "quantity": "10", + "unit_price": 50.0, + "total": 500.0 + }, + { + "description": "Widget B", + "quantity": "20", + "unit_price": 50.0, + "total": 1000.0 + } + ] + } + """ \ No newline at end of file diff --git a/ai/generative-ai-service/multi-modal-document-extraction/requirements.txt b/ai/generative-ai-service/multi-modal-document-extraction/requirements.txt new file mode 100644 index 000000000..cc9cbfb7e --- /dev/null +++ b/ai/generative-ai-service/multi-modal-document-extraction/requirements.txt @@ -0,0 +1,5 @@ +langchain_community==0.3.24 +langchain_core==0.3.59 +pdf2image==1.17.0 +streamlit==1.41.0 +oci==2.150.3 \ No newline at end of file