From 743b64bc2c0903bca356c0626c0c3177279d3968 Mon Sep 17 00:00:00 2001 From: "ds.chienyuanchang@gmail.com" Date: Wed, 30 Jul 2025 18:47:09 +0000 Subject: [PATCH 1/8] add function to get container sas url --- python/content_understanding_client.py | 33 ++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/python/content_understanding_client.py b/python/content_understanding_client.py index 38f61fd..d4d7027 100644 --- a/python/content_understanding_client.py +++ b/python/content_understanding_client.py @@ -6,10 +6,17 @@ import time from dataclasses import dataclass +from datetime import datetime, timedelta, timezone from requests.models import Response from typing import Any, Dict, List, Optional from pathlib import Path +from azure.identity import DefaultAzureCredential +from azure.storage.blob import ( + BlobServiceClient, + generate_container_sas, + ContainerSasPermissions +) from azure.storage.blob.aio import ContainerClient @@ -30,6 +37,7 @@ class AzureContentUnderstandingClient: OCR_RESULT_FILE_SUFFIX: str = ".result.json" LABEL_FILE_SUFFIX: str = ".labels.json" KNOWLEDGE_SOURCE_LIST_FILE_NAME: str = "sources.jsonl" + SAS_EXPIRY_HOURS: int = 1 # https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/service-limits#document-and-text SUPPORTED_FILE_TYPES_DOCUMENT_TXT: List[str] = [ @@ -174,6 +182,31 @@ def is_supported_doc_type_by_file_path(file_path: Path, is_document: bool=False) file_ext = file_path.suffix.lower() return AzureContentUnderstandingClient.is_supported_doc_type_by_file_ext(file_ext, is_document) + @staticmethod + def generate_temp_container_sas_url( + account_name: str, + container_name: str, + ) -> str: + account_url = f"https://{account_name}.blob.core.windows.net" + blob_service_client = BlobServiceClient(account_url=account_url, credential=DefaultAzureCredential()) + + # Get user delegation key + start_time = datetime.now(timezone.utc) + expiry_time = start_time + 
timedelta(hours=AzureContentUnderstandingClient.SAS_EXPIRY_HOURS) + delegation_key = blob_service_client.get_user_delegation_key(start_time, expiry_time) + + sas_token = generate_container_sas( + account_name=account_name, + container_name=container_name, + user_delegation_key=delegation_key, + permission=ContainerSasPermissions(read=True, list=True, write=True), + expiry=expiry_time, + start=start_time, + ) + container_sas_url = f"{account_url}/{container_name}?{sas_token}" + + return container_sas_url + def get_all_analyzers(self) -> Dict[str, Any]: """ Retrieves a list of all available analyzers from the content understanding service. From 0eadbc6ea0ea37b6db456033dd44f089c76de1a7 Mon Sep 17 00:00:00 2001 From: "ds.chienyuanchang@gmail.com" Date: Wed, 30 Jul 2025 20:05:59 +0000 Subject: [PATCH 2/8] revise analyzer_training.ipynb --- notebooks/analyzer_training.ipynb | 1171 ++++++++++++++++++++++++++++- 1 file changed, 1158 insertions(+), 13 deletions(-) diff --git a/notebooks/analyzer_training.ipynb b/notebooks/analyzer_training.ipynb index 837006a..bc5e358 100644 --- a/notebooks/analyzer_training.ipynb +++ b/notebooks/analyzer_training.ipynb @@ -33,9 +33,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Requirement already satisfied: aiohttp in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 1)) (3.12.14)\n", + "Requirement already satisfied: azure-identity in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 2)) (1.23.0)\n", + "Requirement already satisfied: azure-storage-blob in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 3)) (12.25.1)\n", + "Requirement already satisfied: python-dotenv in 
/home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 4)) (1.1.1)\n", + "Requirement already satisfied: requests in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 5)) (2.32.4)\n", + "Requirement already satisfied: Pillow in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 6)) (11.3.0)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.4.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.4.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (25.3.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.7.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (6.6.3)\n", + "Requirement already satisfied: propcache>=0.2.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (0.3.2)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.20.1)\n", + "Requirement already satisfied: idna>=2.0 in /home/vscode/.local/lib/python3.11/site-packages (from yarl<2.0,>=1.17.0->aiohttp->-r ../requirements.txt (line 1)) (3.10)\n", + "Requirement already satisfied: azure-core>=1.31.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.34.0)\n", + "Requirement already satisfied: cryptography>=2.5 in /home/vscode/.local/lib/python3.11/site-packages 
(from azure-identity->-r ../requirements.txt (line 2)) (45.0.4)\n", + "Requirement already satisfied: msal>=1.30.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.32.3)\n", + "Requirement already satisfied: msal-extensions>=1.2.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.3.1)\n", + "Requirement already satisfied: typing-extensions>=4.0.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (4.14.0)\n", + "Requirement already satisfied: isodate>=0.6.1 in /home/vscode/.local/lib/python3.11/site-packages (from azure-storage-blob->-r ../requirements.txt (line 3)) (0.7.2)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (3.4.2)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (2.5.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (2025.6.15)\n", + "Requirement already satisfied: six>=1.11.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-core>=1.31.0->azure-identity->-r ../requirements.txt (line 2)) (1.17.0)\n", + "Requirement already satisfied: cffi>=1.14 in /home/vscode/.local/lib/python3.11/site-packages (from cryptography>=2.5->azure-identity->-r ../requirements.txt (line 2)) (1.17.1)\n", + "Requirement already satisfied: pycparser in /home/vscode/.local/lib/python3.11/site-packages (from cffi>=1.14->cryptography>=2.5->azure-identity->-r ../requirements.txt (line 2)) (2.22)\n", + "Requirement already satisfied: PyJWT<3,>=1.0.0 in /home/vscode/.local/lib/python3.11/site-packages (from PyJWT[crypto]<3,>=1.0.0->msal>=1.30.0->azure-identity->-r 
../requirements.txt (line 2)) (2.10.1)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "%pip install -r ../requirements.txt" ] @@ -55,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -80,9 +116,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:azure.identity._credentials.environment:No environment configuration found.\n", + "INFO:azure.identity._credentials.managed_identity:ManagedIdentityCredential will use IMDS\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'\n", + "Request method: 'GET'\n", + "Request headers:\n", + " 'User-Agent': 'azsdk-python-identity/1.23.0 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", + "No body was attached to the request\n", + "INFO:azure.identity._credentials.chained:DefaultAzureCredential acquired a token from AzureCliCredential\n" + ] + } + ], "source": [ "import logging\n", "import json\n", @@ -128,11 +179,217 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:azure.identity._credentials.environment:No environment configuration found.\n", + "INFO:azure.identity._credentials.managed_identity:ManagedIdentityCredential will use IMDS\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'\n", + "Request method: 'GET'\n", + "Request headers:\n", + " 'User-Agent': 'azsdk-python-identity/1.23.0 Python/3.11.12 
(Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", + "No body was attached to the request\n", + "INFO:azure.identity._credentials.chained:DefaultAzureCredential acquired a token from AzureCliCredential\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://mmigithubsamplesstorage.blob.core.windows.net/?restype=REDACTED&comp=REDACTED'\n", + "Request method: 'POST'\n", + "Request headers:\n", + " 'Content-Length': '130'\n", + " 'x-ms-version': 'REDACTED'\n", + " 'Content-Type': 'application/xml'\n", + " 'Accept': 'application/xml'\n", + " 'User-Agent': 'azsdk-python-storage-blob/12.25.1 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", + " 'x-ms-date': 'REDACTED'\n", + " 'x-ms-client-request-id': '39d96820-6d73-11f0-b984-7ed684973587'\n", + " 'Authorization': 'REDACTED'\n", + "A body is sent with the request\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 200\n", + "Response headers:\n", + " 'Transfer-Encoding': 'chunked'\n", + " 'Content-Type': 'application/xml'\n", + " 'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'\n", + " 'x-ms-request-id': '7f8414ef-201e-0026-6a7f-018681000000'\n", + " 'x-ms-client-request-id': '39d96820-6d73-11f0-b984-7ed684973587'\n", + " 'x-ms-version': 'REDACTED'\n", + " 'Date': 'Wed, 30 Jul 2025 18:30:11 GMT'\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://mmigithubsamplesstorage.blob.core.windows.net/mmi-github-samples-blob-container/test_training_20250730/29d60394-3da1-4714-abdc-ff0993009872.jpg?st=REDACTED&se=REDACTED&sp=REDACTED&sv=REDACTED&sr=REDACTED&skoid=REDACTED&sktid=REDACTED&skt=REDACTED&ske=REDACTED&sks=REDACTED&skv=REDACTED&sig=REDACTED'\n", + "Request method: 'PUT'\n", + "Request headers:\n", + " 'Content-Length': '822964'\n", + " 'x-ms-blob-type': 'REDACTED'\n", + " 'x-ms-version': 'REDACTED'\n", + " 'Content-Type': 'application/octet-stream'\n", + " 'Accept': 
'application/xml'\n", + " 'User-Agent': 'azsdk-python-storage-blob/12.25.1 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", + " 'x-ms-date': 'REDACTED'\n", + " 'x-ms-client-request-id': '3a2d44b8-6d73-11f0-b984-7ed684973587'\n", + "A body is sent with the request\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 201\n", + "Response headers:\n", + " 'Content-Length': '0'\n", + " 'Content-MD5': 'REDACTED'\n", + " 'Last-Modified': 'Wed, 30 Jul 2025 18:30:13 GMT'\n", + " 'Etag': '\"0x8DDCF971F6A8644\"'\n", + " 'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'\n", + " 'x-ms-request-id': 'd55ac00f-a01e-0075-5f7f-01a5b5000000'\n", + " 'x-ms-client-request-id': '3a2d44b8-6d73-11f0-b984-7ed684973587'\n", + " 'x-ms-version': 'REDACTED'\n", + " 'x-ms-content-crc64': 'REDACTED'\n", + " 'x-ms-request-server-encrypted': 'REDACTED'\n", + " 'Date': 'Wed, 30 Jul 2025 18:30:12 GMT'\n", + "INFO:python.content_understanding_client:Uploaded file to test_training_20250730/29d60394-3da1-4714-abdc-ff0993009872.jpg\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://mmigithubsamplesstorage.blob.core.windows.net/mmi-github-samples-blob-container/test_training_20250730/29d60394-3da1-4714-abdc-ff0993009872.jpg.labels.json?st=REDACTED&se=REDACTED&sp=REDACTED&sv=REDACTED&sr=REDACTED&skoid=REDACTED&sktid=REDACTED&skt=REDACTED&ske=REDACTED&sks=REDACTED&skv=REDACTED&sig=REDACTED'\n", + "Request method: 'PUT'\n", + "Request headers:\n", + " 'Content-Length': '2014'\n", + " 'x-ms-blob-type': 'REDACTED'\n", + " 'x-ms-version': 'REDACTED'\n", + " 'Content-Type': 'application/octet-stream'\n", + " 'Accept': 'application/xml'\n", + " 'User-Agent': 'azsdk-python-storage-blob/12.25.1 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", + " 'x-ms-date': 'REDACTED'\n", + " 'x-ms-client-request-id': '3a96d608-6d73-11f0-b984-7ed684973587'\n", + "A body is sent with the 
request\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 201\n", + "Response headers:\n", + " 'Content-Length': '0'\n", + " 'Content-MD5': 'REDACTED'\n", + " 'Last-Modified': 'Wed, 30 Jul 2025 18:30:13 GMT'\n", + " 'Etag': '\"0x8DDCF971F746FD4\"'\n", + " 'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'\n", + " 'x-ms-request-id': 'd55ac210-a01e-0075-517f-01a5b5000000'\n", + " 'x-ms-client-request-id': '3a96d608-6d73-11f0-b984-7ed684973587'\n", + " 'x-ms-version': 'REDACTED'\n", + " 'x-ms-content-crc64': 'REDACTED'\n", + " 'x-ms-request-server-encrypted': 'REDACTED'\n", + " 'Date': 'Wed, 30 Jul 2025 18:30:12 GMT'\n", + "INFO:python.content_understanding_client:Uploaded file to test_training_20250730/29d60394-3da1-4714-abdc-ff0993009872.jpg.labels.json\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://mmigithubsamplesstorage.blob.core.windows.net/mmi-github-samples-blob-container/test_training_20250730/29d60394-3da1-4714-abdc-ff0993009872.jpg.result.json?st=REDACTED&se=REDACTED&sp=REDACTED&sv=REDACTED&sr=REDACTED&skoid=REDACTED&sktid=REDACTED&skt=REDACTED&ske=REDACTED&sks=REDACTED&skv=REDACTED&sig=REDACTED'\n", + "Request method: 'PUT'\n", + "Request headers:\n", + " 'Content-Length': '11545'\n", + " 'x-ms-blob-type': 'REDACTED'\n", + " 'x-ms-version': 'REDACTED'\n", + " 'Content-Type': 'application/octet-stream'\n", + " 'Accept': 'application/xml'\n", + " 'User-Agent': 'azsdk-python-storage-blob/12.25.1 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", + " 'x-ms-date': 'REDACTED'\n", + " 'x-ms-client-request-id': '3a9f376c-6d73-11f0-b984-7ed684973587'\n", + "A body is sent with the request\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 201\n", + "Response headers:\n", + " 'Content-Length': '0'\n", + " 'Content-MD5': 'REDACTED'\n", + " 'Last-Modified': 'Wed, 30 Jul 2025 18:30:13 GMT'\n", + " 'Etag': '\"0x8DDCF971F7CAC04\"'\n", + " 
'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'\n", + " 'x-ms-request-id': 'd55ac24c-a01e-0075-0a7f-01a5b5000000'\n", + " 'x-ms-client-request-id': '3a9f376c-6d73-11f0-b984-7ed684973587'\n", + " 'x-ms-version': 'REDACTED'\n", + " 'x-ms-content-crc64': 'REDACTED'\n", + " 'x-ms-request-server-encrypted': 'REDACTED'\n", + " 'Date': 'Wed, 30 Jul 2025 18:30:12 GMT'\n", + "INFO:python.content_understanding_client:Uploaded file to test_training_20250730/29d60394-3da1-4714-abdc-ff0993009872.jpg.result.json\n", + "INFO:python.content_understanding_client:Uploaded training data for 29d60394-3da1-4714-abdc-ff0993009872.jpg\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://mmigithubsamplesstorage.blob.core.windows.net/mmi-github-samples-blob-container/test_training_20250730/17a84146-e910-460c-bf80-a625e6f64fea.jpg?st=REDACTED&se=REDACTED&sp=REDACTED&sv=REDACTED&sr=REDACTED&skoid=REDACTED&sktid=REDACTED&skt=REDACTED&ske=REDACTED&sks=REDACTED&skv=REDACTED&sig=REDACTED'\n", + "Request method: 'PUT'\n", + "Request headers:\n", + " 'Content-Length': '507561'\n", + " 'x-ms-blob-type': 'REDACTED'\n", + " 'x-ms-version': 'REDACTED'\n", + " 'Content-Type': 'application/octet-stream'\n", + " 'Accept': 'application/xml'\n", + " 'User-Agent': 'azsdk-python-storage-blob/12.25.1 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", + " 'x-ms-date': 'REDACTED'\n", + " 'x-ms-client-request-id': '3aa7a2f8-6d73-11f0-b984-7ed684973587'\n", + "A body is sent with the request\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 201\n", + "Response headers:\n", + " 'Content-Length': '0'\n", + " 'Content-MD5': 'REDACTED'\n", + " 'Last-Modified': 'Wed, 30 Jul 2025 18:30:13 GMT'\n", + " 'Etag': '\"0x8DDCF971FA626CC\"'\n", + " 'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'\n", + " 'x-ms-request-id': 'd55ac28b-a01e-0075-487f-01a5b5000000'\n", + " 'x-ms-client-request-id': 
'3aa7a2f8-6d73-11f0-b984-7ed684973587'\n", + " 'x-ms-version': 'REDACTED'\n", + " 'x-ms-content-crc64': 'REDACTED'\n", + " 'x-ms-request-server-encrypted': 'REDACTED'\n", + " 'Date': 'Wed, 30 Jul 2025 18:30:12 GMT'\n", + "INFO:python.content_understanding_client:Uploaded file to test_training_20250730/17a84146-e910-460c-bf80-a625e6f64fea.jpg\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://mmigithubsamplesstorage.blob.core.windows.net/mmi-github-samples-blob-container/test_training_20250730/17a84146-e910-460c-bf80-a625e6f64fea.jpg.labels.json?st=REDACTED&se=REDACTED&sp=REDACTED&sv=REDACTED&sr=REDACTED&skoid=REDACTED&sktid=REDACTED&skt=REDACTED&ske=REDACTED&sks=REDACTED&skv=REDACTED&sig=REDACTED'\n", + "Request method: 'PUT'\n", + "Request headers:\n", + " 'Content-Length': '1835'\n", + " 'x-ms-blob-type': 'REDACTED'\n", + " 'x-ms-version': 'REDACTED'\n", + " 'Content-Type': 'application/octet-stream'\n", + " 'Accept': 'application/xml'\n", + " 'User-Agent': 'azsdk-python-storage-blob/12.25.1 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", + " 'x-ms-date': 'REDACTED'\n", + " 'x-ms-client-request-id': '3ad0fa18-6d73-11f0-b984-7ed684973587'\n", + "A body is sent with the request\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 201\n", + "Response headers:\n", + " 'Content-Length': '0'\n", + " 'Content-MD5': 'REDACTED'\n", + " 'Last-Modified': 'Wed, 30 Jul 2025 18:30:13 GMT'\n", + " 'Etag': '\"0x8DDCF971FAD78B0\"'\n", + " 'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'\n", + " 'x-ms-request-id': 'd55ac397-a01e-0075-4c7f-01a5b5000000'\n", + " 'x-ms-client-request-id': '3ad0fa18-6d73-11f0-b984-7ed684973587'\n", + " 'x-ms-version': 'REDACTED'\n", + " 'x-ms-content-crc64': 'REDACTED'\n", + " 'x-ms-request-server-encrypted': 'REDACTED'\n", + " 'Date': 'Wed, 30 Jul 2025 18:30:12 GMT'\n", + "INFO:python.content_understanding_client:Uploaded file to 
test_training_20250730/17a84146-e910-460c-bf80-a625e6f64fea.jpg.labels.json\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://mmigithubsamplesstorage.blob.core.windows.net/mmi-github-samples-blob-container/test_training_20250730/17a84146-e910-460c-bf80-a625e6f64fea.jpg.result.json?st=REDACTED&se=REDACTED&sp=REDACTED&sv=REDACTED&sr=REDACTED&skoid=REDACTED&sktid=REDACTED&skt=REDACTED&ske=REDACTED&sks=REDACTED&skv=REDACTED&sig=REDACTED'\n", + "Request method: 'PUT'\n", + "Request headers:\n", + " 'Content-Length': '10683'\n", + " 'x-ms-blob-type': 'REDACTED'\n", + " 'x-ms-version': 'REDACTED'\n", + " 'Content-Type': 'application/octet-stream'\n", + " 'Accept': 'application/xml'\n", + " 'User-Agent': 'azsdk-python-storage-blob/12.25.1 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", + " 'x-ms-date': 'REDACTED'\n", + " 'x-ms-client-request-id': '3ad81730-6d73-11f0-b984-7ed684973587'\n", + "A body is sent with the request\n", + "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 201\n", + "Response headers:\n", + " 'Content-Length': '0'\n", + " 'Content-MD5': 'REDACTED'\n", + " 'Last-Modified': 'Wed, 30 Jul 2025 18:30:13 GMT'\n", + " 'Etag': '\"0x8DDCF971FB6780D\"'\n", + " 'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'\n", + " 'x-ms-request-id': 'd55ac3cc-a01e-0075-7e7f-01a5b5000000'\n", + " 'x-ms-client-request-id': '3ad81730-6d73-11f0-b984-7ed684973587'\n", + " 'x-ms-version': 'REDACTED'\n", + " 'x-ms-content-crc64': 'REDACTED'\n", + " 'x-ms-request-server-encrypted': 'REDACTED'\n", + " 'Date': 'Wed, 30 Jul 2025 18:30:12 GMT'\n", + "INFO:python.content_understanding_client:Uploaded file to test_training_20250730/17a84146-e910-460c-bf80-a625e6f64fea.jpg.result.json\n", + "INFO:python.content_understanding_client:Uploaded training data for 17a84146-e910-460c-bf80-a625e6f64fea.jpg\n" + ] + } + ], "source": [ "TRAINING_DATA_SAS_URL = os.getenv(\"TRAINING_DATA_SAS_URL\")\n", + 
"if not TRAINING_DATA_SAS_URL:\n", + " TRAINING_DATA_STORAGE_ACCOUNT_NAME = os.getenv(\"TRAINING_DATA_STORAGE_ACCCOUNT_NAME\")\n", + " TRAINING_DATA_CONTAINER_NAME = os.getenv(\"TRAINING_DATA_CONTAINER_NAME\")\n", + " if not TRAINING_DATA_STORAGE_ACCOUNT_NAME and not TRAINING_DATA_SAS_URL:\n", + " raise ValueError(\n", + " \"Please set either TRAINING_DATA_SAS_URL or both TRAINING_DATA_STORAGE_ACCCOUNT_NAME and TRAINING_DATA_CONTAINER_NAME environment variables.\"\n", + " )\n", + " TRAINING_DATA_SAS_URL = AzureContentUnderstandingClient.generate_temp_container_sas_url(\n", + " TRAINING_DATA_STORAGE_ACCOUNT_NAME,\n", + " TRAINING_DATA_CONTAINER_NAME\n", + " )\n", + "\n", "TRAINING_DATA_PATH = os.getenv(\"TRAINING_DATA_PATH\")\n", "\n", "await client.generate_training_data_on_blob(training_docs_folder, TRAINING_DATA_SAS_URL, TRAINING_DATA_PATH)" @@ -150,9 +407,88 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:python.content_understanding_client:Analyzer train-sample-58b42e42-e59e-41a1-862f-40e14e0ba0a3 create request accepted.\n", + "INFO:python.content_understanding_client:Request 9d46504a-0521-4940-897b-c56432562474 in progress ...\n", + "INFO:python.content_understanding_client:Request result is ready after 2.22 seconds.\n", + "INFO:root:Analyzer details for train-sample-58b42e42-e59e-41a1-862f-40e14e0ba0a3\n", + "INFO:root:{\n", + " \"id\": \"9d46504a-0521-4940-897b-c56432562474\",\n", + " \"status\": \"Succeeded\",\n", + " \"result\": {\n", + " \"analyzerId\": \"train-sample-58b42e42-e59e-41a1-862f-40e14e0ba0a3\",\n", + " \"description\": \"Extract useful information from receipt\",\n", + " \"createdAt\": \"2025-07-30T18:30:28Z\",\n", + " \"lastModifiedAt\": \"2025-07-30T18:30:29Z\",\n", + " \"baseAnalyzerId\": \"prebuilt-documentAnalyzer\",\n", + " \"config\": {\n", + " \"returnDetails\": true,\n", + " 
\"enableOcr\": true,\n", + " \"enableLayout\": true,\n", + " \"enableFormula\": false,\n", + " \"disableContentFiltering\": false,\n", + " \"tableFormat\": \"html\"\n", + " },\n", + " \"fieldSchema\": {\n", + " \"fields\": {\n", + " \"MerchantName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"\"\n", + " },\n", + " \"Items\": {\n", + " \"type\": \"array\",\n", + " \"method\": \"generate\",\n", + " \"description\": \"\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"method\": \"extract\",\n", + " \"properties\": {\n", + " \"Quantity\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"\"\n", + " },\n", + " \"Name\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"\"\n", + " },\n", + " \"Price\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " \"TotalPrice\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"\"\n", + " }\n", + " }\n", + " },\n", + " \"trainingData\": {\n", + " \"containerUrl\": \"https://mmigithubsamplesstorage.blob.core.windows.net/mmi-github-samples-blob-container?st=2025-07-30T18%3A30%3A10Z&se=2025-07-30T19%3A30%3A10Z&sp=rwl&sv=2025-05-05&sr=c&skoid=83f4a972-1d44-4737-ba69-497f43bc66e2&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2025-07-30T18%3A30%3A10Z&ske=2025-07-30T19%3A30%3A10Z&sks=b&skv=2025-05-05&sig=kJBr0b3pz4MRMJOdDx8Rv0Pa5v3OcqC4KcmaPj/gM54%3D\",\n", + " \"kind\": \"blob\",\n", + " \"prefix\": \"test_training_20250730/\"\n", + " },\n", + " \"warnings\": [],\n", + " \"status\": \"ready\",\n", + " \"processingLocation\": \"geography\",\n", + " \"mode\": \"standard\"\n", + " }\n", + "}\n" + ] + } + ], "source": [ "import uuid\n", "CUSTOM_ANALYZER_ID = \"train-sample-\" + str(uuid.uuid4())\n", @@ -184,9 +520,800 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:python.content_understanding_client:Analyzing file ../data/receipt.png with analyzer: train-sample-58b42e42-e59e-41a1-862f-40e14e0ba0a3\n", + "INFO:python.content_understanding_client:Request f68cdf71-6db1-4e3b-8933-8cf2bf9a5e06 in progress ...\n", + "INFO:python.content_understanding_client:Request f68cdf71-6db1-4e3b-8933-8cf2bf9a5e06 in progress ...\n", + "INFO:python.content_understanding_client:Request result is ready after 4.45 seconds.\n", + "INFO:root:{\n", + " \"id\": \"f68cdf71-6db1-4e3b-8933-8cf2bf9a5e06\",\n", + " \"status\": \"Succeeded\",\n", + " \"result\": {\n", + " \"analyzerId\": \"train-sample-58b42e42-e59e-41a1-862f-40e14e0ba0a3\",\n", + " \"apiVersion\": \"2025-05-01-preview\",\n", + " \"createdAt\": \"2025-07-30T18:30:31Z\",\n", + " \"warnings\": [],\n", + " \"contents\": [\n", + " {\n", + " \"markdown\": \"Contoso\\n\\n123 Main Street\\nRedmond, WA 98052\\n\\n987-654-3210\\n\\n6/10/2019 13:59\\nSales Associate: Paul\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
2 Surface Pro 6$1,998.00
3 Surface Pen$299.97
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
Sub-Total$2,297.97
Tax$218.31
Total$2,516.28
\\n\",\n", + " \"fields\": {\n", + " \"MerchantName\": {\n", + " \"type\": \"string\",\n", + " \"valueString\": \"Contoso\"\n", + " },\n", + " \"Items\": {\n", + " \"type\": \"array\",\n", + " \"valueArray\": [\n", + " {\n", + " \"type\": \"object\",\n", + " \"valueObject\": {\n", + " \"Quantity\": {\n", + " \"type\": \"string\",\n", + " \"valueString\": \"2\"\n", + " },\n", + " \"Name\": {\n", + " \"type\": \"string\",\n", + " \"valueString\": \"Surface Pro 6\"\n", + " },\n", + " \"Price\": {\n", + " \"type\": \"string\",\n", + " \"valueString\": \"$1,998.00\"\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"type\": \"object\",\n", + " \"valueObject\": {\n", + " \"Quantity\": {\n", + " \"type\": \"string\",\n", + " \"valueString\": \"3\"\n", + " },\n", + " \"Name\": {\n", + " \"type\": \"string\",\n", + " \"valueString\": \"Surface Pen\"\n", + " },\n", + " \"Price\": {\n", + " \"type\": \"string\",\n", + " \"valueString\": \"$299.97\"\n", + " }\n", + " }\n", + " }\n", + " ]\n", + " },\n", + " \"TotalPrice\": {\n", + " \"type\": \"string\",\n", + " \"valueString\": \"$2,516.28\"\n", + " }\n", + " },\n", + " \"kind\": \"document\",\n", + " \"startPageNumber\": 1,\n", + " \"endPageNumber\": 1,\n", + " \"unit\": \"pixel\",\n", + " \"pages\": [\n", + " {\n", + " \"pageNumber\": 1,\n", + " \"angle\": -0.0848,\n", + " \"width\": 1743,\n", + " \"height\": 878,\n", + " \"spans\": [\n", + " {\n", + " \"offset\": 0,\n", + " \"length\": 375\n", + " }\n", + " ],\n", + " \"words\": [\n", + " {\n", + " \"content\": \"Contoso\",\n", + " \"span\": {\n", + " \"offset\": 0,\n", + " \"length\": 7\n", + " },\n", + " \"confidence\": 0.995,\n", + " \"source\": \"D(1,774,72,974,70,974,111,774,113)\"\n", + " },\n", + " {\n", + " \"content\": \"123\",\n", + " \"span\": {\n", + " \"offset\": 9,\n", + " \"length\": 3\n", + " },\n", + " \"confidence\": 0.997,\n", + " \"source\": \"D(1,700,189,730,189,730,213,700,213)\"\n", + " },\n", + " {\n", + " \"content\": \"Main\",\n", + " \"span\": 
{\n", + " \"offset\": 13,\n", + " \"length\": 4\n", + " },\n", + " \"confidence\": 0.993,\n", + " \"source\": \"D(1,738,189,786,189,786,212,738,213)\"\n", + " },\n", + " {\n", + " \"content\": \"Street\",\n", + " \"span\": {\n", + " \"offset\": 18,\n", + " \"length\": 6\n", + " },\n", + " \"confidence\": 0.996,\n", + " \"source\": \"D(1,795,189,860,188,860,212,795,212)\"\n", + " },\n", + " {\n", + " \"content\": \"Redmond,\",\n", + " \"span\": {\n", + " \"offset\": 25,\n", + " \"length\": 8\n", + " },\n", + " \"confidence\": 0.993,\n", + " \"source\": \"D(1,699,224,800,224,800,249,699,249)\"\n", + " },\n", + " {\n", + " \"content\": \"WA\",\n", + " \"span\": {\n", + " \"offset\": 34,\n", + " \"length\": 2\n", + " },\n", + " \"confidence\": 0.973,\n", + " \"source\": \"D(1,808,223,841,223,841,248,808,249)\"\n", + " },\n", + " {\n", + " \"content\": \"98052\",\n", + " \"span\": {\n", + " \"offset\": 37,\n", + " \"length\": 5\n", + " },\n", + " \"confidence\": 0.997,\n", + " \"source\": \"D(1,847,223,912,222,912,247,847,248)\"\n", + " },\n", + " {\n", + " \"content\": \"987-654-3210\",\n", + " \"span\": {\n", + " \"offset\": 44,\n", + " \"length\": 12\n", + " },\n", + " \"confidence\": 0.993,\n", + " \"source\": \"D(1,699,298,843,298,843,322,699,322)\"\n", + " },\n", + " {\n", + " \"content\": \"6/10/2019\",\n", + " \"span\": {\n", + " \"offset\": 58,\n", + " \"length\": 9\n", + " },\n", + " \"confidence\": 0.992,\n", + " \"source\": \"D(1,699,372,794,372,794,399,699,399)\"\n", + " },\n", + " {\n", + " \"content\": \"13:59\",\n", + " \"span\": {\n", + " \"offset\": 68,\n", + " \"length\": 5\n", + " },\n", + " \"confidence\": 0.993,\n", + " \"source\": \"D(1,803,372,853,372,853,398,803,399)\"\n", + " },\n", + " {\n", + " \"content\": \"Sales\",\n", + " \"span\": {\n", + " \"offset\": 74,\n", + " \"length\": 5\n", + " },\n", + " \"confidence\": 0.995,\n", + " \"source\": \"D(1,699,409,756,409,756,433,699,433)\"\n", + " },\n", + " {\n", + " \"content\": 
\"Associate:\",\n", + " \"span\": {\n", + " \"offset\": 80,\n", + " \"length\": 10\n", + " },\n", + " \"confidence\": 0.989,\n", + " \"source\": \"D(1,764,409,868,409,868,433,764,433)\"\n", + " },\n", + " {\n", + " \"content\": \"Paul\",\n", + " \"span\": {\n", + " \"offset\": 91,\n", + " \"length\": 4\n", + " },\n", + " \"confidence\": 0.991,\n", + " \"source\": \"D(1,876,409,924,409,924,433,876,433)\"\n", + " },\n", + " {\n", + " \"content\": \"2\",\n", + " \"span\": {\n", + " \"offset\": 115,\n", + " \"length\": 1\n", + " },\n", + " \"confidence\": 0.995,\n", + " \"source\": \"D(1,704,483,717,483,717,508,704,508)\"\n", + " },\n", + " {\n", + " \"content\": \"Surface\",\n", + " \"span\": {\n", + " \"offset\": 117,\n", + " \"length\": 7\n", + " },\n", + " \"confidence\": 0.992,\n", + " \"source\": \"D(1,731,483,811,483,811,508,731,508)\"\n", + " },\n", + " {\n", + " \"content\": \"Pro\",\n", + " \"span\": {\n", + " \"offset\": 125,\n", + " \"length\": 3\n", + " },\n", + " \"confidence\": 0.995,\n", + " \"source\": \"D(1,820,483,854,483,854,508,820,508)\"\n", + " },\n", + " {\n", + " \"content\": \"6\",\n", + " \"span\": {\n", + " \"offset\": 129,\n", + " \"length\": 1\n", + " },\n", + " \"confidence\": 0.977,\n", + " \"source\": \"D(1,862,483,875,482,875,507,862,508)\"\n", + " },\n", + " {\n", + " \"content\": \"$1,998.00\",\n", + " \"span\": {\n", + " \"offset\": 140,\n", + " \"length\": 9\n", + " },\n", + " \"confidence\": 0.993,\n", + " \"source\": \"D(1,952,482,1048,482,1048,508,952,509)\"\n", + " },\n", + " {\n", + " \"content\": \"3\",\n", + " \"span\": {\n", + " \"offset\": 170,\n", + " \"length\": 1\n", + " },\n", + " \"confidence\": 0.995,\n", + " \"source\": \"D(1,703,522,716,522,715,546,703,546)\"\n", + " },\n", + " {\n", + " \"content\": \"Surface\",\n", + " \"span\": {\n", + " \"offset\": 172,\n", + " \"length\": 7\n", + " },\n", + " \"confidence\": 0.995,\n", + " \"source\": \"D(1,731,522,812,521,812,546,731,546)\"\n", + " },\n", + " {\n", + " 
\"content\": \"Pen\",\n", + " \"span\": {\n", + " \"offset\": 180,\n", + " \"length\": 3\n", + " },\n", + " \"confidence\": 0.996,\n", + " \"source\": \"D(1,820,521,859,522,859,546,820,546)\"\n", + " },\n", + " {\n", + " \"content\": \"$299.97\",\n", + " \"span\": {\n", + " \"offset\": 193,\n", + " \"length\": 7\n", + " },\n", + " \"confidence\": 0.994,\n", + " \"source\": \"D(1,969,521,1050,521,1050,546,969,546)\"\n", + " },\n", + " {\n", + " \"content\": \"Sub-Total\",\n", + " \"span\": {\n", + " \"offset\": 240,\n", + " \"length\": 9\n", + " },\n", + " \"confidence\": 0.994,\n", + " \"source\": \"D(1,764,597,869,597,869,621,764,621)\"\n", + " },\n", + " {\n", + " \"content\": \"$2,297.97\",\n", + " \"span\": {\n", + " \"offset\": 259,\n", + " \"length\": 9\n", + " },\n", + " \"confidence\": 0.992,\n", + " \"source\": \"D(1,952,597,1051,597,1051,622,952,623)\"\n", + " },\n", + " {\n", + " \"content\": \"Tax\",\n", + " \"span\": {\n", + " \"offset\": 289,\n", + " \"length\": 3\n", + " },\n", + " \"confidence\": 0.998,\n", + " \"source\": \"D(1,767,635,805,635,805,659,767,659)\"\n", + " },\n", + " {\n", + " \"content\": \"$218.31\",\n", + " \"span\": {\n", + " \"offset\": 302,\n", + " \"length\": 7\n", + " },\n", + " \"confidence\": 0.994,\n", + " \"source\": \"D(1,976,634,1051,634,1051,659,976,659)\"\n", + " },\n", + " {\n", + " \"content\": \"Total\",\n", + " \"span\": {\n", + " \"offset\": 330,\n", + " \"length\": 5\n", + " },\n", + " \"confidence\": 0.993,\n", + " \"source\": \"D(1,768,712,822,712,822,737,768,737)\"\n", + " },\n", + " {\n", + " \"content\": \"$2,516.28\",\n", + " \"span\": {\n", + " \"offset\": 345,\n", + " \"length\": 9\n", + " },\n", + " \"confidence\": 0.992,\n", + " \"source\": \"D(1,959,710,1054,709,1054,737,959,737)\"\n", + " }\n", + " ],\n", + " \"lines\": [\n", + " {\n", + " \"content\": \"Contoso\",\n", + " \"source\": \"D(1,774,71,973,70,974,111,774,113)\",\n", + " \"span\": {\n", + " \"offset\": 0,\n", + " \"length\": 7\n", + " }\n", 
+ " },\n", + " {\n", + " \"content\": \"123 Main Street\",\n", + " \"source\": \"D(1,699,189,859,188,859,212,700,213)\",\n", + " \"span\": {\n", + " \"offset\": 9,\n", + " \"length\": 15\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"Redmond, WA 98052\",\n", + " \"source\": \"D(1,699,224,911,222,911,247,699,249)\",\n", + " \"span\": {\n", + " \"offset\": 25,\n", + " \"length\": 17\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"987-654-3210\",\n", + " \"source\": \"D(1,699,298,842,298,842,322,699,322)\",\n", + " \"span\": {\n", + " \"offset\": 44,\n", + " \"length\": 12\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"6/10/2019 13:59\",\n", + " \"source\": \"D(1,699,372,853,372,853,399,699,399)\",\n", + " \"span\": {\n", + " \"offset\": 58,\n", + " \"length\": 15\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"Sales Associate: Paul\",\n", + " \"source\": \"D(1,699,409,923,409,923,433,699,433)\",\n", + " \"span\": {\n", + " \"offset\": 74,\n", + " \"length\": 21\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"2 Surface Pro 6\",\n", + " \"source\": \"D(1,703,483,874,482,874,507,704,508)\",\n", + " \"span\": {\n", + " \"offset\": 115,\n", + " \"length\": 15\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"$1,998.00\",\n", + " \"source\": \"D(1,952,482,1048,482,1048,509,952,508)\",\n", + " \"span\": {\n", + " \"offset\": 140,\n", + " \"length\": 9\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"3 Surface Pen\",\n", + " \"source\": \"D(1,703,522,859,521,859,546,703,546)\",\n", + " \"span\": {\n", + " \"offset\": 170,\n", + " \"length\": 13\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"$299.97\",\n", + " \"source\": \"D(1,969,521,1049,521,1049,546,969,546)\",\n", + " \"span\": {\n", + " \"offset\": 193,\n", + " \"length\": 7\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"Sub-Total\",\n", + " \"source\": \"D(1,764,597,868,597,868,621,764,621)\",\n", + " \"span\": {\n", + " \"offset\": 240,\n", + " \"length\": 9\n", + 
" }\n", + " },\n", + " {\n", + " \"content\": \"$2,297.97\",\n", + " \"source\": \"D(1,952,597,1050,597,1050,622,952,623)\",\n", + " \"span\": {\n", + " \"offset\": 259,\n", + " \"length\": 9\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"Tax\",\n", + " \"source\": \"D(1,767,635,804,635,804,658,767,658)\",\n", + " \"span\": {\n", + " \"offset\": 289,\n", + " \"length\": 3\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"$218.31\",\n", + " \"source\": \"D(1,976,634,1051,634,1051,659,976,659)\",\n", + " \"span\": {\n", + " \"offset\": 302,\n", + " \"length\": 7\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"Total\",\n", + " \"source\": \"D(1,768,712,821,712,821,736,768,736)\",\n", + " \"span\": {\n", + " \"offset\": 330,\n", + " \"length\": 5\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"$2,516.28\",\n", + " \"source\": \"D(1,959,710,1054,709,1054,737,959,737)\",\n", + " \"span\": {\n", + " \"offset\": 345,\n", + " \"length\": 9\n", + " }\n", + " }\n", + " ]\n", + " }\n", + " ],\n", + " \"paragraphs\": [\n", + " {\n", + " \"content\": \"Contoso\",\n", + " \"source\": \"D(1,774,71,973,70,974,111,774,113)\",\n", + " \"span\": {\n", + " \"offset\": 0,\n", + " \"length\": 7\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"123 Main Street Redmond, WA 98052\",\n", + " \"source\": \"D(1,698,189,911,188,911,247,699,249)\",\n", + " \"span\": {\n", + " \"offset\": 9,\n", + " \"length\": 33\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"987-654-3210\",\n", + " \"source\": \"D(1,699,298,842,298,842,322,699,322)\",\n", + " \"span\": {\n", + " \"offset\": 44,\n", + " \"length\": 12\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"6/10/2019 13:59 Sales Associate: Paul\",\n", + " \"source\": \"D(1,699,372,923,372,923,433,699,433)\",\n", + " \"span\": {\n", + " \"offset\": 58,\n", + " \"length\": 37\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"2 Surface Pro 6\",\n", + " \"source\": \"D(1,691,471,911,470,911,514,691,515)\",\n", + " 
\"span\": {\n", + " \"offset\": 115,\n", + " \"length\": 15\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"$1,998.00\",\n", + " \"source\": \"D(1,911,470,1057,470,1057,513,911,514)\",\n", + " \"span\": {\n", + " \"offset\": 140,\n", + " \"length\": 9\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"3 Surface Pen\",\n", + " \"source\": \"D(1,691,515,911,514,912,556,691,557)\",\n", + " \"span\": {\n", + " \"offset\": 170,\n", + " \"length\": 13\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"$299.97\",\n", + " \"source\": \"D(1,911,514,1057,513,1057,555,912,556)\",\n", + " \"span\": {\n", + " \"offset\": 193,\n", + " \"length\": 7\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"Sub-Total\",\n", + " \"source\": \"D(1,753,585,909,585,910,627,753,628)\",\n", + " \"span\": {\n", + " \"offset\": 240,\n", + " \"length\": 9\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"$2,297.97\",\n", + " \"source\": \"D(1,909,585,1060,586,1060,627,910,627)\",\n", + " \"span\": {\n", + " \"offset\": 259,\n", + " \"length\": 9\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"Tax\",\n", + " \"source\": \"D(1,753,628,910,627,910,683,754,684)\",\n", + " \"span\": {\n", + " \"offset\": 289,\n", + " \"length\": 3\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"$218.31\",\n", + " \"source\": \"D(1,910,627,1060,627,1061,683,910,683)\",\n", + " \"span\": {\n", + " \"offset\": 302,\n", + " \"length\": 7\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"Total\",\n", + " \"source\": \"D(1,754,684,910,683,910,747,754,748)\",\n", + " \"span\": {\n", + " \"offset\": 330,\n", + " \"length\": 5\n", + " }\n", + " },\n", + " {\n", + " \"content\": \"$2,516.28\",\n", + " \"source\": \"D(1,910,683,1061,683,1062,747,910,747)\",\n", + " \"span\": {\n", + " \"offset\": 345,\n", + " \"length\": 9\n", + " }\n", + " }\n", + " ],\n", + " \"sections\": [\n", + " {\n", + " \"span\": {\n", + " \"offset\": 0,\n", + " \"length\": 374\n", + " },\n", + " \"elements\": [\n", + " 
\"/paragraphs/0\",\n", + " \"/paragraphs/1\",\n", + " \"/paragraphs/2\",\n", + " \"/paragraphs/3\",\n", + " \"/tables/0\",\n", + " \"/tables/1\"\n", + " ]\n", + " }\n", + " ],\n", + " \"tables\": [\n", + " {\n", + " \"rowCount\": 2,\n", + " \"columnCount\": 2,\n", + " \"cells\": [\n", + " {\n", + " \"kind\": \"content\",\n", + " \"rowIndex\": 0,\n", + " \"columnIndex\": 0,\n", + " \"rowSpan\": 1,\n", + " \"columnSpan\": 1,\n", + " \"content\": \"2 Surface Pro 6\",\n", + " \"source\": \"D(1,691,471,911,470,911,514,691,515)\",\n", + " \"span\": {\n", + " \"offset\": 115,\n", + " \"length\": 15\n", + " },\n", + " \"elements\": [\n", + " \"/paragraphs/4\"\n", + " ]\n", + " },\n", + " {\n", + " \"kind\": \"content\",\n", + " \"rowIndex\": 0,\n", + " \"columnIndex\": 1,\n", + " \"rowSpan\": 1,\n", + " \"columnSpan\": 1,\n", + " \"content\": \"$1,998.00\",\n", + " \"source\": \"D(1,911,470,1057,470,1057,513,911,514)\",\n", + " \"span\": {\n", + " \"offset\": 140,\n", + " \"length\": 9\n", + " },\n", + " \"elements\": [\n", + " \"/paragraphs/5\"\n", + " ]\n", + " },\n", + " {\n", + " \"kind\": \"content\",\n", + " \"rowIndex\": 1,\n", + " \"columnIndex\": 0,\n", + " \"rowSpan\": 1,\n", + " \"columnSpan\": 1,\n", + " \"content\": \"3 Surface Pen\",\n", + " \"source\": \"D(1,691,515,911,514,912,556,691,557)\",\n", + " \"span\": {\n", + " \"offset\": 170,\n", + " \"length\": 13\n", + " },\n", + " \"elements\": [\n", + " \"/paragraphs/6\"\n", + " ]\n", + " },\n", + " {\n", + " \"kind\": \"content\",\n", + " \"rowIndex\": 1,\n", + " \"columnIndex\": 1,\n", + " \"rowSpan\": 1,\n", + " \"columnSpan\": 1,\n", + " \"content\": \"$299.97\",\n", + " \"source\": \"D(1,911,514,1057,513,1057,555,912,556)\",\n", + " \"span\": {\n", + " \"offset\": 193,\n", + " \"length\": 7\n", + " },\n", + " \"elements\": [\n", + " \"/paragraphs/7\"\n", + " ]\n", + " }\n", + " ],\n", + " \"source\": \"D(1,698,479,1050,478,1051,551,698,552)\",\n", + " \"span\": {\n", + " \"offset\": 98,\n", + " 
\"length\": 122\n", + " }\n", + " },\n", + " {\n", + " \"rowCount\": 3,\n", + " \"columnCount\": 2,\n", + " \"cells\": [\n", + " {\n", + " \"kind\": \"content\",\n", + " \"rowIndex\": 0,\n", + " \"columnIndex\": 0,\n", + " \"rowSpan\": 1,\n", + " \"columnSpan\": 1,\n", + " \"content\": \"Sub-Total\",\n", + " \"source\": \"D(1,753,585,909,585,910,627,753,628)\",\n", + " \"span\": {\n", + " \"offset\": 240,\n", + " \"length\": 9\n", + " },\n", + " \"elements\": [\n", + " \"/paragraphs/8\"\n", + " ]\n", + " },\n", + " {\n", + " \"kind\": \"content\",\n", + " \"rowIndex\": 0,\n", + " \"columnIndex\": 1,\n", + " \"rowSpan\": 1,\n", + " \"columnSpan\": 1,\n", + " \"content\": \"$2,297.97\",\n", + " \"source\": \"D(1,909,585,1060,586,1060,627,910,627)\",\n", + " \"span\": {\n", + " \"offset\": 259,\n", + " \"length\": 9\n", + " },\n", + " \"elements\": [\n", + " \"/paragraphs/9\"\n", + " ]\n", + " },\n", + " {\n", + " \"kind\": \"content\",\n", + " \"rowIndex\": 1,\n", + " \"columnIndex\": 0,\n", + " \"rowSpan\": 1,\n", + " \"columnSpan\": 1,\n", + " \"content\": \"Tax\",\n", + " \"source\": \"D(1,753,628,910,627,910,683,754,684)\",\n", + " \"span\": {\n", + " \"offset\": 289,\n", + " \"length\": 3\n", + " },\n", + " \"elements\": [\n", + " \"/paragraphs/10\"\n", + " ]\n", + " },\n", + " {\n", + " \"kind\": \"content\",\n", + " \"rowIndex\": 1,\n", + " \"columnIndex\": 1,\n", + " \"rowSpan\": 1,\n", + " \"columnSpan\": 1,\n", + " \"content\": \"$218.31\",\n", + " \"source\": \"D(1,910,627,1060,627,1061,683,910,683)\",\n", + " \"span\": {\n", + " \"offset\": 302,\n", + " \"length\": 7\n", + " },\n", + " \"elements\": [\n", + " \"/paragraphs/11\"\n", + " ]\n", + " },\n", + " {\n", + " \"kind\": \"content\",\n", + " \"rowIndex\": 2,\n", + " \"columnIndex\": 0,\n", + " \"rowSpan\": 1,\n", + " \"columnSpan\": 1,\n", + " \"content\": \"Total\",\n", + " \"source\": \"D(1,754,684,910,683,910,747,754,748)\",\n", + " \"span\": {\n", + " \"offset\": 330,\n", + " \"length\": 5\n", + 
" },\n", + " \"elements\": [\n", + " \"/paragraphs/12\"\n", + " ]\n", + " },\n", + " {\n", + " \"kind\": \"content\",\n", + " \"rowIndex\": 2,\n", + " \"columnIndex\": 1,\n", + " \"rowSpan\": 1,\n", + " \"columnSpan\": 1,\n", + " \"content\": \"$2,516.28\",\n", + " \"source\": \"D(1,910,683,1061,683,1062,747,910,747)\",\n", + " \"span\": {\n", + " \"offset\": 345,\n", + " \"length\": 9\n", + " },\n", + " \"elements\": [\n", + " \"/paragraphs/13\"\n", + " ]\n", + " }\n", + " ],\n", + " \"source\": \"D(1,759,593,1056,592,1057,741,760,742)\",\n", + " \"span\": {\n", + " \"offset\": 223,\n", + " \"length\": 151\n", + " }\n", + " }\n", + " ]\n", + " }\n", + " ]\n", + " }\n", + "}\n" + ] + } + ], "source": [ "response = client.begin_analyze(CUSTOM_ANALYZER_ID, file_location='../data/receipt.png')\n", "result_json = client.poll_result(response)\n", @@ -204,9 +1331,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:python.content_understanding_client:Analyzer train-sample-58b42e42-e59e-41a1-862f-40e14e0ba0a3 deleted.\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "client.delete_analyzer(CUSTOM_ANALYZER_ID)" ] From f7e6acaf6a8fc2bd01689dbf5ec2acd9902d51a2 Mon Sep 17 00:00:00 2001 From: "ds.chienyuanchang@gmail.com" Date: Wed, 30 Jul 2025 20:29:12 +0000 Subject: [PATCH 3/8] revise field_extraction_pro_mode.ipynb --- notebooks/analyzer_training.ipynb | 1185 +-------------------- notebooks/field_extraction_pro_mode.ipynb | 32 +- 2 files changed, 47 insertions(+), 1170 deletions(-) diff --git a/notebooks/analyzer_training.ipynb b/notebooks/analyzer_training.ipynb index bc5e358..73bfa29 100644 --- a/notebooks/analyzer_training.ipynb +++ b/notebooks/analyzer_training.ipynb @@ -23,55 +23,18 @@ "\n", "## Prerequisites\n", "1. 
Ensure Azure AI service is configured following [steps](../README.md#configure-azure-ai-service-resource)\n", - "1. Follow steps in [Set env for trainging data](../docs/set_env_for_training_data_and_reference_doc.md) to add training data related env variables `TRAINING_DATA_SAS_URL` and `TRAINING_DATA_PATH` into the [.env](./.env) file.\n", - " - `TRAINING_DATA_SAS_URL`: SAS URL for your Azure Blob container. \n", - " - `TRAINING_DATA_PATH`: Folder path within the container to upload training data. \n", - "1. Install packages needed to run the sample\n", - "\n", - "\n" + "2. Follow steps in [Set env for trainging data](../docs/set_env_for_training_data_and_reference_doc.md) to add training data related environment variables into the [.env](./.env) file.\n", + " - You can either set `TRAINING_DATA_SAS_URL` directly with the SAS URL for your Azure Blob container,\n", + " - Or set both `TRAINING_DATA_STORAGE_ACCOUNT_NAME` and `TRAINING_DATA_CONTAINER_NAME`, so the SAS URL can be generated automatically during one of the later steps.\n", + " - Also set `TRAINING_DATA_PATH` to specify the folder path within the container where training data will be uploaded.\n", + "3. 
Install packages needed to run the sample\n" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Defaulting to user installation because normal site-packages is not writeable\n", - "Requirement already satisfied: aiohttp in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 1)) (3.12.14)\n", - "Requirement already satisfied: azure-identity in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 2)) (1.23.0)\n", - "Requirement already satisfied: azure-storage-blob in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 3)) (12.25.1)\n", - "Requirement already satisfied: python-dotenv in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 4)) (1.1.1)\n", - "Requirement already satisfied: requests in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 5)) (2.32.4)\n", - "Requirement already satisfied: Pillow in /home/vscode/.local/lib/python3.11/site-packages (from -r ../requirements.txt (line 6)) (11.3.0)\n", - "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (2.6.1)\n", - "Requirement already satisfied: aiosignal>=1.4.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.4.0)\n", - "Requirement already satisfied: attrs>=17.3.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (25.3.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.7.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r 
../requirements.txt (line 1)) (6.6.3)\n", - "Requirement already satisfied: propcache>=0.2.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (0.3.2)\n", - "Requirement already satisfied: yarl<2.0,>=1.17.0 in /home/vscode/.local/lib/python3.11/site-packages (from aiohttp->-r ../requirements.txt (line 1)) (1.20.1)\n", - "Requirement already satisfied: idna>=2.0 in /home/vscode/.local/lib/python3.11/site-packages (from yarl<2.0,>=1.17.0->aiohttp->-r ../requirements.txt (line 1)) (3.10)\n", - "Requirement already satisfied: azure-core>=1.31.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.34.0)\n", - "Requirement already satisfied: cryptography>=2.5 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (45.0.4)\n", - "Requirement already satisfied: msal>=1.30.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.32.3)\n", - "Requirement already satisfied: msal-extensions>=1.2.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (1.3.1)\n", - "Requirement already satisfied: typing-extensions>=4.0.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-identity->-r ../requirements.txt (line 2)) (4.14.0)\n", - "Requirement already satisfied: isodate>=0.6.1 in /home/vscode/.local/lib/python3.11/site-packages (from azure-storage-blob->-r ../requirements.txt (line 3)) (0.7.2)\n", - "Requirement already satisfied: charset_normalizer<4,>=2 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (3.4.2)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (2.5.0)\n", - "Requirement already satisfied: certifi>=2017.4.17 in 
/home/vscode/.local/lib/python3.11/site-packages (from requests->-r ../requirements.txt (line 5)) (2025.6.15)\n", - "Requirement already satisfied: six>=1.11.0 in /home/vscode/.local/lib/python3.11/site-packages (from azure-core>=1.31.0->azure-identity->-r ../requirements.txt (line 2)) (1.17.0)\n", - "Requirement already satisfied: cffi>=1.14 in /home/vscode/.local/lib/python3.11/site-packages (from cryptography>=2.5->azure-identity->-r ../requirements.txt (line 2)) (1.17.1)\n", - "Requirement already satisfied: pycparser in /home/vscode/.local/lib/python3.11/site-packages (from cffi>=1.14->cryptography>=2.5->azure-identity->-r ../requirements.txt (line 2)) (2.22)\n", - "Requirement already satisfied: PyJWT<3,>=1.0.0 in /home/vscode/.local/lib/python3.11/site-packages (from PyJWT[crypto]<3,>=1.0.0->msal>=1.30.0->azure-identity->-r ../requirements.txt (line 2)) (2.10.1)\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], + "outputs": [], "source": [ "%pip install -r ../requirements.txt" ] @@ -91,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -116,24 +79,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:azure.identity._credentials.environment:No environment configuration found.\n", - "INFO:azure.identity._credentials.managed_identity:ManagedIdentityCredential will use IMDS\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'\n", - "Request method: 'GET'\n", - "Request headers:\n", - " 'User-Agent': 'azsdk-python-identity/1.23.0 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", - "No body was attached to the request\n", - 
"INFO:azure.identity._credentials.chained:DefaultAzureCredential acquired a token from AzureCliCredential\n" - ] - } - ], + "outputs": [], "source": [ "import logging\n", "import json\n", @@ -170,216 +118,23 @@ "metadata": {}, "source": [ "## Prepare labeled data\n", - "In this step, we will \n", - "- Check whether document files in local folder have corresponding `.labels.json` and `.result.json` files\n", - "- Upload these files to the designated Azure blob storage.\n", - "\n", - "We use **TRAINING_DATA_SAS_URL** and **TRAINING_DATA_PATH** that's set in the Prerequisites step." + "In this step, we will\n", + "- Use `TRAINING_DATA_PATH` and SAS URL related environment variables that were set in the Prerequisites step.\n", + "- Try to get the SAS URL from the environment variable `TRAINING_DATA_SAS_URL`.\n", + "If this is not set, we attempt to generate the SAS URL automatically using the environment variables `TRAINING_DATA_STORAGE_ACCOUNT_NAME` and `TRAINING_DATA_CONTAINER_NAME`.\n", + "- Verify that document files in the local folder have corresponding `.labels.json` and `.result.json` files\n", + "- Upload these files to the Azure Blob storage container specified by the environment variables." 
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:azure.identity._credentials.environment:No environment configuration found.\n", - "INFO:azure.identity._credentials.managed_identity:ManagedIdentityCredential will use IMDS\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'\n", - "Request method: 'GET'\n", - "Request headers:\n", - " 'User-Agent': 'azsdk-python-identity/1.23.0 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", - "No body was attached to the request\n", - "INFO:azure.identity._credentials.chained:DefaultAzureCredential acquired a token from AzureCliCredential\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://mmigithubsamplesstorage.blob.core.windows.net/?restype=REDACTED&comp=REDACTED'\n", - "Request method: 'POST'\n", - "Request headers:\n", - " 'Content-Length': '130'\n", - " 'x-ms-version': 'REDACTED'\n", - " 'Content-Type': 'application/xml'\n", - " 'Accept': 'application/xml'\n", - " 'User-Agent': 'azsdk-python-storage-blob/12.25.1 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", - " 'x-ms-date': 'REDACTED'\n", - " 'x-ms-client-request-id': '39d96820-6d73-11f0-b984-7ed684973587'\n", - " 'Authorization': 'REDACTED'\n", - "A body is sent with the request\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 200\n", - "Response headers:\n", - " 'Transfer-Encoding': 'chunked'\n", - " 'Content-Type': 'application/xml'\n", - " 'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'\n", - " 'x-ms-request-id': '7f8414ef-201e-0026-6a7f-018681000000'\n", - " 'x-ms-client-request-id': '39d96820-6d73-11f0-b984-7ed684973587'\n", - " 'x-ms-version': 'REDACTED'\n", - " 'Date': 'Wed, 30 Jul 
2025 18:30:11 GMT'\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://mmigithubsamplesstorage.blob.core.windows.net/mmi-github-samples-blob-container/test_training_20250730/29d60394-3da1-4714-abdc-ff0993009872.jpg?st=REDACTED&se=REDACTED&sp=REDACTED&sv=REDACTED&sr=REDACTED&skoid=REDACTED&sktid=REDACTED&skt=REDACTED&ske=REDACTED&sks=REDACTED&skv=REDACTED&sig=REDACTED'\n", - "Request method: 'PUT'\n", - "Request headers:\n", - " 'Content-Length': '822964'\n", - " 'x-ms-blob-type': 'REDACTED'\n", - " 'x-ms-version': 'REDACTED'\n", - " 'Content-Type': 'application/octet-stream'\n", - " 'Accept': 'application/xml'\n", - " 'User-Agent': 'azsdk-python-storage-blob/12.25.1 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", - " 'x-ms-date': 'REDACTED'\n", - " 'x-ms-client-request-id': '3a2d44b8-6d73-11f0-b984-7ed684973587'\n", - "A body is sent with the request\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 201\n", - "Response headers:\n", - " 'Content-Length': '0'\n", - " 'Content-MD5': 'REDACTED'\n", - " 'Last-Modified': 'Wed, 30 Jul 2025 18:30:13 GMT'\n", - " 'Etag': '\"0x8DDCF971F6A8644\"'\n", - " 'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'\n", - " 'x-ms-request-id': 'd55ac00f-a01e-0075-5f7f-01a5b5000000'\n", - " 'x-ms-client-request-id': '3a2d44b8-6d73-11f0-b984-7ed684973587'\n", - " 'x-ms-version': 'REDACTED'\n", - " 'x-ms-content-crc64': 'REDACTED'\n", - " 'x-ms-request-server-encrypted': 'REDACTED'\n", - " 'Date': 'Wed, 30 Jul 2025 18:30:12 GMT'\n", - "INFO:python.content_understanding_client:Uploaded file to test_training_20250730/29d60394-3da1-4714-abdc-ff0993009872.jpg\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 
'https://mmigithubsamplesstorage.blob.core.windows.net/mmi-github-samples-blob-container/test_training_20250730/29d60394-3da1-4714-abdc-ff0993009872.jpg.labels.json?st=REDACTED&se=REDACTED&sp=REDACTED&sv=REDACTED&sr=REDACTED&skoid=REDACTED&sktid=REDACTED&skt=REDACTED&ske=REDACTED&sks=REDACTED&skv=REDACTED&sig=REDACTED'\n", - "Request method: 'PUT'\n", - "Request headers:\n", - " 'Content-Length': '2014'\n", - " 'x-ms-blob-type': 'REDACTED'\n", - " 'x-ms-version': 'REDACTED'\n", - " 'Content-Type': 'application/octet-stream'\n", - " 'Accept': 'application/xml'\n", - " 'User-Agent': 'azsdk-python-storage-blob/12.25.1 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", - " 'x-ms-date': 'REDACTED'\n", - " 'x-ms-client-request-id': '3a96d608-6d73-11f0-b984-7ed684973587'\n", - "A body is sent with the request\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 201\n", - "Response headers:\n", - " 'Content-Length': '0'\n", - " 'Content-MD5': 'REDACTED'\n", - " 'Last-Modified': 'Wed, 30 Jul 2025 18:30:13 GMT'\n", - " 'Etag': '\"0x8DDCF971F746FD4\"'\n", - " 'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'\n", - " 'x-ms-request-id': 'd55ac210-a01e-0075-517f-01a5b5000000'\n", - " 'x-ms-client-request-id': '3a96d608-6d73-11f0-b984-7ed684973587'\n", - " 'x-ms-version': 'REDACTED'\n", - " 'x-ms-content-crc64': 'REDACTED'\n", - " 'x-ms-request-server-encrypted': 'REDACTED'\n", - " 'Date': 'Wed, 30 Jul 2025 18:30:12 GMT'\n", - "INFO:python.content_understanding_client:Uploaded file to test_training_20250730/29d60394-3da1-4714-abdc-ff0993009872.jpg.labels.json\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 
'https://mmigithubsamplesstorage.blob.core.windows.net/mmi-github-samples-blob-container/test_training_20250730/29d60394-3da1-4714-abdc-ff0993009872.jpg.result.json?st=REDACTED&se=REDACTED&sp=REDACTED&sv=REDACTED&sr=REDACTED&skoid=REDACTED&sktid=REDACTED&skt=REDACTED&ske=REDACTED&sks=REDACTED&skv=REDACTED&sig=REDACTED'\n", - "Request method: 'PUT'\n", - "Request headers:\n", - " 'Content-Length': '11545'\n", - " 'x-ms-blob-type': 'REDACTED'\n", - " 'x-ms-version': 'REDACTED'\n", - " 'Content-Type': 'application/octet-stream'\n", - " 'Accept': 'application/xml'\n", - " 'User-Agent': 'azsdk-python-storage-blob/12.25.1 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", - " 'x-ms-date': 'REDACTED'\n", - " 'x-ms-client-request-id': '3a9f376c-6d73-11f0-b984-7ed684973587'\n", - "A body is sent with the request\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 201\n", - "Response headers:\n", - " 'Content-Length': '0'\n", - " 'Content-MD5': 'REDACTED'\n", - " 'Last-Modified': 'Wed, 30 Jul 2025 18:30:13 GMT'\n", - " 'Etag': '\"0x8DDCF971F7CAC04\"'\n", - " 'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'\n", - " 'x-ms-request-id': 'd55ac24c-a01e-0075-0a7f-01a5b5000000'\n", - " 'x-ms-client-request-id': '3a9f376c-6d73-11f0-b984-7ed684973587'\n", - " 'x-ms-version': 'REDACTED'\n", - " 'x-ms-content-crc64': 'REDACTED'\n", - " 'x-ms-request-server-encrypted': 'REDACTED'\n", - " 'Date': 'Wed, 30 Jul 2025 18:30:12 GMT'\n", - "INFO:python.content_understanding_client:Uploaded file to test_training_20250730/29d60394-3da1-4714-abdc-ff0993009872.jpg.result.json\n", - "INFO:python.content_understanding_client:Uploaded training data for 29d60394-3da1-4714-abdc-ff0993009872.jpg\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 
'https://mmigithubsamplesstorage.blob.core.windows.net/mmi-github-samples-blob-container/test_training_20250730/17a84146-e910-460c-bf80-a625e6f64fea.jpg?st=REDACTED&se=REDACTED&sp=REDACTED&sv=REDACTED&sr=REDACTED&skoid=REDACTED&sktid=REDACTED&skt=REDACTED&ske=REDACTED&sks=REDACTED&skv=REDACTED&sig=REDACTED'\n", - "Request method: 'PUT'\n", - "Request headers:\n", - " 'Content-Length': '507561'\n", - " 'x-ms-blob-type': 'REDACTED'\n", - " 'x-ms-version': 'REDACTED'\n", - " 'Content-Type': 'application/octet-stream'\n", - " 'Accept': 'application/xml'\n", - " 'User-Agent': 'azsdk-python-storage-blob/12.25.1 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", - " 'x-ms-date': 'REDACTED'\n", - " 'x-ms-client-request-id': '3aa7a2f8-6d73-11f0-b984-7ed684973587'\n", - "A body is sent with the request\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 201\n", - "Response headers:\n", - " 'Content-Length': '0'\n", - " 'Content-MD5': 'REDACTED'\n", - " 'Last-Modified': 'Wed, 30 Jul 2025 18:30:13 GMT'\n", - " 'Etag': '\"0x8DDCF971FA626CC\"'\n", - " 'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'\n", - " 'x-ms-request-id': 'd55ac28b-a01e-0075-487f-01a5b5000000'\n", - " 'x-ms-client-request-id': '3aa7a2f8-6d73-11f0-b984-7ed684973587'\n", - " 'x-ms-version': 'REDACTED'\n", - " 'x-ms-content-crc64': 'REDACTED'\n", - " 'x-ms-request-server-encrypted': 'REDACTED'\n", - " 'Date': 'Wed, 30 Jul 2025 18:30:12 GMT'\n", - "INFO:python.content_understanding_client:Uploaded file to test_training_20250730/17a84146-e910-460c-bf80-a625e6f64fea.jpg\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 
'https://mmigithubsamplesstorage.blob.core.windows.net/mmi-github-samples-blob-container/test_training_20250730/17a84146-e910-460c-bf80-a625e6f64fea.jpg.labels.json?st=REDACTED&se=REDACTED&sp=REDACTED&sv=REDACTED&sr=REDACTED&skoid=REDACTED&sktid=REDACTED&skt=REDACTED&ske=REDACTED&sks=REDACTED&skv=REDACTED&sig=REDACTED'\n", - "Request method: 'PUT'\n", - "Request headers:\n", - " 'Content-Length': '1835'\n", - " 'x-ms-blob-type': 'REDACTED'\n", - " 'x-ms-version': 'REDACTED'\n", - " 'Content-Type': 'application/octet-stream'\n", - " 'Accept': 'application/xml'\n", - " 'User-Agent': 'azsdk-python-storage-blob/12.25.1 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", - " 'x-ms-date': 'REDACTED'\n", - " 'x-ms-client-request-id': '3ad0fa18-6d73-11f0-b984-7ed684973587'\n", - "A body is sent with the request\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 201\n", - "Response headers:\n", - " 'Content-Length': '0'\n", - " 'Content-MD5': 'REDACTED'\n", - " 'Last-Modified': 'Wed, 30 Jul 2025 18:30:13 GMT'\n", - " 'Etag': '\"0x8DDCF971FAD78B0\"'\n", - " 'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'\n", - " 'x-ms-request-id': 'd55ac397-a01e-0075-4c7f-01a5b5000000'\n", - " 'x-ms-client-request-id': '3ad0fa18-6d73-11f0-b984-7ed684973587'\n", - " 'x-ms-version': 'REDACTED'\n", - " 'x-ms-content-crc64': 'REDACTED'\n", - " 'x-ms-request-server-encrypted': 'REDACTED'\n", - " 'Date': 'Wed, 30 Jul 2025 18:30:12 GMT'\n", - "INFO:python.content_understanding_client:Uploaded file to test_training_20250730/17a84146-e910-460c-bf80-a625e6f64fea.jpg.labels.json\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 
'https://mmigithubsamplesstorage.blob.core.windows.net/mmi-github-samples-blob-container/test_training_20250730/17a84146-e910-460c-bf80-a625e6f64fea.jpg.result.json?st=REDACTED&se=REDACTED&sp=REDACTED&sv=REDACTED&sr=REDACTED&skoid=REDACTED&sktid=REDACTED&skt=REDACTED&ske=REDACTED&sks=REDACTED&skv=REDACTED&sig=REDACTED'\n", - "Request method: 'PUT'\n", - "Request headers:\n", - " 'Content-Length': '10683'\n", - " 'x-ms-blob-type': 'REDACTED'\n", - " 'x-ms-version': 'REDACTED'\n", - " 'Content-Type': 'application/octet-stream'\n", - " 'Accept': 'application/xml'\n", - " 'User-Agent': 'azsdk-python-storage-blob/12.25.1 Python/3.11.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.36)'\n", - " 'x-ms-date': 'REDACTED'\n", - " 'x-ms-client-request-id': '3ad81730-6d73-11f0-b984-7ed684973587'\n", - "A body is sent with the request\n", - "INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 201\n", - "Response headers:\n", - " 'Content-Length': '0'\n", - " 'Content-MD5': 'REDACTED'\n", - " 'Last-Modified': 'Wed, 30 Jul 2025 18:30:13 GMT'\n", - " 'Etag': '\"0x8DDCF971FB6780D\"'\n", - " 'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'\n", - " 'x-ms-request-id': 'd55ac3cc-a01e-0075-7e7f-01a5b5000000'\n", - " 'x-ms-client-request-id': '3ad81730-6d73-11f0-b984-7ed684973587'\n", - " 'x-ms-version': 'REDACTED'\n", - " 'x-ms-content-crc64': 'REDACTED'\n", - " 'x-ms-request-server-encrypted': 'REDACTED'\n", - " 'Date': 'Wed, 30 Jul 2025 18:30:12 GMT'\n", - "INFO:python.content_understanding_client:Uploaded file to test_training_20250730/17a84146-e910-460c-bf80-a625e6f64fea.jpg.result.json\n", - "INFO:python.content_understanding_client:Uploaded training data for 17a84146-e910-460c-bf80-a625e6f64fea.jpg\n" - ] - } - ], + "outputs": [], "source": [ "TRAINING_DATA_SAS_URL = os.getenv(\"TRAINING_DATA_SAS_URL\")\n", "if not TRAINING_DATA_SAS_URL:\n", - " TRAINING_DATA_STORAGE_ACCOUNT_NAME = os.getenv(\"TRAINING_DATA_STORAGE_ACCCOUNT_NAME\")\n", + " 
TRAINING_DATA_STORAGE_ACCOUNT_NAME = os.getenv(\"TRAINING_DATA_STORAGE_ACCOUNT_NAME\")\n", " TRAINING_DATA_CONTAINER_NAME = os.getenv(\"TRAINING_DATA_CONTAINER_NAME\")\n", " if not TRAINING_DATA_STORAGE_ACCOUNT_NAME and not TRAINING_DATA_SAS_URL:\n", " raise ValueError(\n", @@ -387,7 +142,7 @@ " )\n", " TRAINING_DATA_SAS_URL = AzureContentUnderstandingClient.generate_temp_container_sas_url(\n", " TRAINING_DATA_STORAGE_ACCOUNT_NAME,\n", - " TRAINING_DATA_CONTAINER_NAME\n", + " TRAINING_DATA_CONTAINER_NAME,\n", " )\n", "\n", "TRAINING_DATA_PATH = os.getenv(\"TRAINING_DATA_PATH\")\n", @@ -407,88 +162,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:python.content_understanding_client:Analyzer train-sample-58b42e42-e59e-41a1-862f-40e14e0ba0a3 create request accepted.\n", - "INFO:python.content_understanding_client:Request 9d46504a-0521-4940-897b-c56432562474 in progress ...\n", - "INFO:python.content_understanding_client:Request result is ready after 2.22 seconds.\n", - "INFO:root:Analyzer details for train-sample-58b42e42-e59e-41a1-862f-40e14e0ba0a3\n", - "INFO:root:{\n", - " \"id\": \"9d46504a-0521-4940-897b-c56432562474\",\n", - " \"status\": \"Succeeded\",\n", - " \"result\": {\n", - " \"analyzerId\": \"train-sample-58b42e42-e59e-41a1-862f-40e14e0ba0a3\",\n", - " \"description\": \"Extract useful information from receipt\",\n", - " \"createdAt\": \"2025-07-30T18:30:28Z\",\n", - " \"lastModifiedAt\": \"2025-07-30T18:30:29Z\",\n", - " \"baseAnalyzerId\": \"prebuilt-documentAnalyzer\",\n", - " \"config\": {\n", - " \"returnDetails\": true,\n", - " \"enableOcr\": true,\n", - " \"enableLayout\": true,\n", - " \"enableFormula\": false,\n", - " \"disableContentFiltering\": false,\n", - " \"tableFormat\": \"html\"\n", - " },\n", - " \"fieldSchema\": {\n", - " \"fields\": {\n", - " \"MerchantName\": {\n", - " \"type\": \"string\",\n", - " 
\"method\": \"extract\",\n", - " \"description\": \"\"\n", - " },\n", - " \"Items\": {\n", - " \"type\": \"array\",\n", - " \"method\": \"generate\",\n", - " \"description\": \"\",\n", - " \"items\": {\n", - " \"type\": \"object\",\n", - " \"method\": \"extract\",\n", - " \"properties\": {\n", - " \"Quantity\": {\n", - " \"type\": \"string\",\n", - " \"method\": \"extract\",\n", - " \"description\": \"\"\n", - " },\n", - " \"Name\": {\n", - " \"type\": \"string\",\n", - " \"method\": \"extract\",\n", - " \"description\": \"\"\n", - " },\n", - " \"Price\": {\n", - " \"type\": \"string\",\n", - " \"method\": \"extract\",\n", - " \"description\": \"\"\n", - " }\n", - " }\n", - " }\n", - " },\n", - " \"TotalPrice\": {\n", - " \"type\": \"string\",\n", - " \"method\": \"extract\",\n", - " \"description\": \"\"\n", - " }\n", - " }\n", - " },\n", - " \"trainingData\": {\n", - " \"containerUrl\": \"https://mmigithubsamplesstorage.blob.core.windows.net/mmi-github-samples-blob-container?st=2025-07-30T18%3A30%3A10Z&se=2025-07-30T19%3A30%3A10Z&sp=rwl&sv=2025-05-05&sr=c&skoid=83f4a972-1d44-4737-ba69-497f43bc66e2&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2025-07-30T18%3A30%3A10Z&ske=2025-07-30T19%3A30%3A10Z&sks=b&skv=2025-05-05&sig=kJBr0b3pz4MRMJOdDx8Rv0Pa5v3OcqC4KcmaPj/gM54%3D\",\n", - " \"kind\": \"blob\",\n", - " \"prefix\": \"test_training_20250730/\"\n", - " },\n", - " \"warnings\": [],\n", - " \"status\": \"ready\",\n", - " \"processingLocation\": \"geography\",\n", - " \"mode\": \"standard\"\n", - " }\n", - "}\n" - ] - } - ], + "outputs": [], "source": [ "import uuid\n", "CUSTOM_ANALYZER_ID = \"train-sample-\" + str(uuid.uuid4())\n", @@ -520,800 +196,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:python.content_understanding_client:Analyzing file ../data/receipt.png with analyzer: 
train-sample-58b42e42-e59e-41a1-862f-40e14e0ba0a3\n", - "INFO:python.content_understanding_client:Request f68cdf71-6db1-4e3b-8933-8cf2bf9a5e06 in progress ...\n", - "INFO:python.content_understanding_client:Request f68cdf71-6db1-4e3b-8933-8cf2bf9a5e06 in progress ...\n", - "INFO:python.content_understanding_client:Request result is ready after 4.45 seconds.\n", - "INFO:root:{\n", - " \"id\": \"f68cdf71-6db1-4e3b-8933-8cf2bf9a5e06\",\n", - " \"status\": \"Succeeded\",\n", - " \"result\": {\n", - " \"analyzerId\": \"train-sample-58b42e42-e59e-41a1-862f-40e14e0ba0a3\",\n", - " \"apiVersion\": \"2025-05-01-preview\",\n", - " \"createdAt\": \"2025-07-30T18:30:31Z\",\n", - " \"warnings\": [],\n", - " \"contents\": [\n", - " {\n", - " \"markdown\": \"Contoso\\n\\n123 Main Street\\nRedmond, WA 98052\\n\\n987-654-3210\\n\\n6/10/2019 13:59\\nSales Associate: Paul\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
2 Surface Pro 6$1,998.00
3 Surface Pen$299.97
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
Sub-Total$2,297.97
Tax$218.31
Total$2,516.28
\\n\",\n", - " \"fields\": {\n", - " \"MerchantName\": {\n", - " \"type\": \"string\",\n", - " \"valueString\": \"Contoso\"\n", - " },\n", - " \"Items\": {\n", - " \"type\": \"array\",\n", - " \"valueArray\": [\n", - " {\n", - " \"type\": \"object\",\n", - " \"valueObject\": {\n", - " \"Quantity\": {\n", - " \"type\": \"string\",\n", - " \"valueString\": \"2\"\n", - " },\n", - " \"Name\": {\n", - " \"type\": \"string\",\n", - " \"valueString\": \"Surface Pro 6\"\n", - " },\n", - " \"Price\": {\n", - " \"type\": \"string\",\n", - " \"valueString\": \"$1,998.00\"\n", - " }\n", - " }\n", - " },\n", - " {\n", - " \"type\": \"object\",\n", - " \"valueObject\": {\n", - " \"Quantity\": {\n", - " \"type\": \"string\",\n", - " \"valueString\": \"3\"\n", - " },\n", - " \"Name\": {\n", - " \"type\": \"string\",\n", - " \"valueString\": \"Surface Pen\"\n", - " },\n", - " \"Price\": {\n", - " \"type\": \"string\",\n", - " \"valueString\": \"$299.97\"\n", - " }\n", - " }\n", - " }\n", - " ]\n", - " },\n", - " \"TotalPrice\": {\n", - " \"type\": \"string\",\n", - " \"valueString\": \"$2,516.28\"\n", - " }\n", - " },\n", - " \"kind\": \"document\",\n", - " \"startPageNumber\": 1,\n", - " \"endPageNumber\": 1,\n", - " \"unit\": \"pixel\",\n", - " \"pages\": [\n", - " {\n", - " \"pageNumber\": 1,\n", - " \"angle\": -0.0848,\n", - " \"width\": 1743,\n", - " \"height\": 878,\n", - " \"spans\": [\n", - " {\n", - " \"offset\": 0,\n", - " \"length\": 375\n", - " }\n", - " ],\n", - " \"words\": [\n", - " {\n", - " \"content\": \"Contoso\",\n", - " \"span\": {\n", - " \"offset\": 0,\n", - " \"length\": 7\n", - " },\n", - " \"confidence\": 0.995,\n", - " \"source\": \"D(1,774,72,974,70,974,111,774,113)\"\n", - " },\n", - " {\n", - " \"content\": \"123\",\n", - " \"span\": {\n", - " \"offset\": 9,\n", - " \"length\": 3\n", - " },\n", - " \"confidence\": 0.997,\n", - " \"source\": \"D(1,700,189,730,189,730,213,700,213)\"\n", - " },\n", - " {\n", - " \"content\": \"Main\",\n", - " \"span\": 
{\n", - " \"offset\": 13,\n", - " \"length\": 4\n", - " },\n", - " \"confidence\": 0.993,\n", - " \"source\": \"D(1,738,189,786,189,786,212,738,213)\"\n", - " },\n", - " {\n", - " \"content\": \"Street\",\n", - " \"span\": {\n", - " \"offset\": 18,\n", - " \"length\": 6\n", - " },\n", - " \"confidence\": 0.996,\n", - " \"source\": \"D(1,795,189,860,188,860,212,795,212)\"\n", - " },\n", - " {\n", - " \"content\": \"Redmond,\",\n", - " \"span\": {\n", - " \"offset\": 25,\n", - " \"length\": 8\n", - " },\n", - " \"confidence\": 0.993,\n", - " \"source\": \"D(1,699,224,800,224,800,249,699,249)\"\n", - " },\n", - " {\n", - " \"content\": \"WA\",\n", - " \"span\": {\n", - " \"offset\": 34,\n", - " \"length\": 2\n", - " },\n", - " \"confidence\": 0.973,\n", - " \"source\": \"D(1,808,223,841,223,841,248,808,249)\"\n", - " },\n", - " {\n", - " \"content\": \"98052\",\n", - " \"span\": {\n", - " \"offset\": 37,\n", - " \"length\": 5\n", - " },\n", - " \"confidence\": 0.997,\n", - " \"source\": \"D(1,847,223,912,222,912,247,847,248)\"\n", - " },\n", - " {\n", - " \"content\": \"987-654-3210\",\n", - " \"span\": {\n", - " \"offset\": 44,\n", - " \"length\": 12\n", - " },\n", - " \"confidence\": 0.993,\n", - " \"source\": \"D(1,699,298,843,298,843,322,699,322)\"\n", - " },\n", - " {\n", - " \"content\": \"6/10/2019\",\n", - " \"span\": {\n", - " \"offset\": 58,\n", - " \"length\": 9\n", - " },\n", - " \"confidence\": 0.992,\n", - " \"source\": \"D(1,699,372,794,372,794,399,699,399)\"\n", - " },\n", - " {\n", - " \"content\": \"13:59\",\n", - " \"span\": {\n", - " \"offset\": 68,\n", - " \"length\": 5\n", - " },\n", - " \"confidence\": 0.993,\n", - " \"source\": \"D(1,803,372,853,372,853,398,803,399)\"\n", - " },\n", - " {\n", - " \"content\": \"Sales\",\n", - " \"span\": {\n", - " \"offset\": 74,\n", - " \"length\": 5\n", - " },\n", - " \"confidence\": 0.995,\n", - " \"source\": \"D(1,699,409,756,409,756,433,699,433)\"\n", - " },\n", - " {\n", - " \"content\": 
\"Associate:\",\n", - " \"span\": {\n", - " \"offset\": 80,\n", - " \"length\": 10\n", - " },\n", - " \"confidence\": 0.989,\n", - " \"source\": \"D(1,764,409,868,409,868,433,764,433)\"\n", - " },\n", - " {\n", - " \"content\": \"Paul\",\n", - " \"span\": {\n", - " \"offset\": 91,\n", - " \"length\": 4\n", - " },\n", - " \"confidence\": 0.991,\n", - " \"source\": \"D(1,876,409,924,409,924,433,876,433)\"\n", - " },\n", - " {\n", - " \"content\": \"2\",\n", - " \"span\": {\n", - " \"offset\": 115,\n", - " \"length\": 1\n", - " },\n", - " \"confidence\": 0.995,\n", - " \"source\": \"D(1,704,483,717,483,717,508,704,508)\"\n", - " },\n", - " {\n", - " \"content\": \"Surface\",\n", - " \"span\": {\n", - " \"offset\": 117,\n", - " \"length\": 7\n", - " },\n", - " \"confidence\": 0.992,\n", - " \"source\": \"D(1,731,483,811,483,811,508,731,508)\"\n", - " },\n", - " {\n", - " \"content\": \"Pro\",\n", - " \"span\": {\n", - " \"offset\": 125,\n", - " \"length\": 3\n", - " },\n", - " \"confidence\": 0.995,\n", - " \"source\": \"D(1,820,483,854,483,854,508,820,508)\"\n", - " },\n", - " {\n", - " \"content\": \"6\",\n", - " \"span\": {\n", - " \"offset\": 129,\n", - " \"length\": 1\n", - " },\n", - " \"confidence\": 0.977,\n", - " \"source\": \"D(1,862,483,875,482,875,507,862,508)\"\n", - " },\n", - " {\n", - " \"content\": \"$1,998.00\",\n", - " \"span\": {\n", - " \"offset\": 140,\n", - " \"length\": 9\n", - " },\n", - " \"confidence\": 0.993,\n", - " \"source\": \"D(1,952,482,1048,482,1048,508,952,509)\"\n", - " },\n", - " {\n", - " \"content\": \"3\",\n", - " \"span\": {\n", - " \"offset\": 170,\n", - " \"length\": 1\n", - " },\n", - " \"confidence\": 0.995,\n", - " \"source\": \"D(1,703,522,716,522,715,546,703,546)\"\n", - " },\n", - " {\n", - " \"content\": \"Surface\",\n", - " \"span\": {\n", - " \"offset\": 172,\n", - " \"length\": 7\n", - " },\n", - " \"confidence\": 0.995,\n", - " \"source\": \"D(1,731,522,812,521,812,546,731,546)\"\n", - " },\n", - " {\n", - " 
\"content\": \"Pen\",\n", - " \"span\": {\n", - " \"offset\": 180,\n", - " \"length\": 3\n", - " },\n", - " \"confidence\": 0.996,\n", - " \"source\": \"D(1,820,521,859,522,859,546,820,546)\"\n", - " },\n", - " {\n", - " \"content\": \"$299.97\",\n", - " \"span\": {\n", - " \"offset\": 193,\n", - " \"length\": 7\n", - " },\n", - " \"confidence\": 0.994,\n", - " \"source\": \"D(1,969,521,1050,521,1050,546,969,546)\"\n", - " },\n", - " {\n", - " \"content\": \"Sub-Total\",\n", - " \"span\": {\n", - " \"offset\": 240,\n", - " \"length\": 9\n", - " },\n", - " \"confidence\": 0.994,\n", - " \"source\": \"D(1,764,597,869,597,869,621,764,621)\"\n", - " },\n", - " {\n", - " \"content\": \"$2,297.97\",\n", - " \"span\": {\n", - " \"offset\": 259,\n", - " \"length\": 9\n", - " },\n", - " \"confidence\": 0.992,\n", - " \"source\": \"D(1,952,597,1051,597,1051,622,952,623)\"\n", - " },\n", - " {\n", - " \"content\": \"Tax\",\n", - " \"span\": {\n", - " \"offset\": 289,\n", - " \"length\": 3\n", - " },\n", - " \"confidence\": 0.998,\n", - " \"source\": \"D(1,767,635,805,635,805,659,767,659)\"\n", - " },\n", - " {\n", - " \"content\": \"$218.31\",\n", - " \"span\": {\n", - " \"offset\": 302,\n", - " \"length\": 7\n", - " },\n", - " \"confidence\": 0.994,\n", - " \"source\": \"D(1,976,634,1051,634,1051,659,976,659)\"\n", - " },\n", - " {\n", - " \"content\": \"Total\",\n", - " \"span\": {\n", - " \"offset\": 330,\n", - " \"length\": 5\n", - " },\n", - " \"confidence\": 0.993,\n", - " \"source\": \"D(1,768,712,822,712,822,737,768,737)\"\n", - " },\n", - " {\n", - " \"content\": \"$2,516.28\",\n", - " \"span\": {\n", - " \"offset\": 345,\n", - " \"length\": 9\n", - " },\n", - " \"confidence\": 0.992,\n", - " \"source\": \"D(1,959,710,1054,709,1054,737,959,737)\"\n", - " }\n", - " ],\n", - " \"lines\": [\n", - " {\n", - " \"content\": \"Contoso\",\n", - " \"source\": \"D(1,774,71,973,70,974,111,774,113)\",\n", - " \"span\": {\n", - " \"offset\": 0,\n", - " \"length\": 7\n", - " }\n", 
- " },\n", - " {\n", - " \"content\": \"123 Main Street\",\n", - " \"source\": \"D(1,699,189,859,188,859,212,700,213)\",\n", - " \"span\": {\n", - " \"offset\": 9,\n", - " \"length\": 15\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"Redmond, WA 98052\",\n", - " \"source\": \"D(1,699,224,911,222,911,247,699,249)\",\n", - " \"span\": {\n", - " \"offset\": 25,\n", - " \"length\": 17\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"987-654-3210\",\n", - " \"source\": \"D(1,699,298,842,298,842,322,699,322)\",\n", - " \"span\": {\n", - " \"offset\": 44,\n", - " \"length\": 12\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"6/10/2019 13:59\",\n", - " \"source\": \"D(1,699,372,853,372,853,399,699,399)\",\n", - " \"span\": {\n", - " \"offset\": 58,\n", - " \"length\": 15\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"Sales Associate: Paul\",\n", - " \"source\": \"D(1,699,409,923,409,923,433,699,433)\",\n", - " \"span\": {\n", - " \"offset\": 74,\n", - " \"length\": 21\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"2 Surface Pro 6\",\n", - " \"source\": \"D(1,703,483,874,482,874,507,704,508)\",\n", - " \"span\": {\n", - " \"offset\": 115,\n", - " \"length\": 15\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"$1,998.00\",\n", - " \"source\": \"D(1,952,482,1048,482,1048,509,952,508)\",\n", - " \"span\": {\n", - " \"offset\": 140,\n", - " \"length\": 9\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"3 Surface Pen\",\n", - " \"source\": \"D(1,703,522,859,521,859,546,703,546)\",\n", - " \"span\": {\n", - " \"offset\": 170,\n", - " \"length\": 13\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"$299.97\",\n", - " \"source\": \"D(1,969,521,1049,521,1049,546,969,546)\",\n", - " \"span\": {\n", - " \"offset\": 193,\n", - " \"length\": 7\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"Sub-Total\",\n", - " \"source\": \"D(1,764,597,868,597,868,621,764,621)\",\n", - " \"span\": {\n", - " \"offset\": 240,\n", - " \"length\": 9\n", - 
" }\n", - " },\n", - " {\n", - " \"content\": \"$2,297.97\",\n", - " \"source\": \"D(1,952,597,1050,597,1050,622,952,623)\",\n", - " \"span\": {\n", - " \"offset\": 259,\n", - " \"length\": 9\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"Tax\",\n", - " \"source\": \"D(1,767,635,804,635,804,658,767,658)\",\n", - " \"span\": {\n", - " \"offset\": 289,\n", - " \"length\": 3\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"$218.31\",\n", - " \"source\": \"D(1,976,634,1051,634,1051,659,976,659)\",\n", - " \"span\": {\n", - " \"offset\": 302,\n", - " \"length\": 7\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"Total\",\n", - " \"source\": \"D(1,768,712,821,712,821,736,768,736)\",\n", - " \"span\": {\n", - " \"offset\": 330,\n", - " \"length\": 5\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"$2,516.28\",\n", - " \"source\": \"D(1,959,710,1054,709,1054,737,959,737)\",\n", - " \"span\": {\n", - " \"offset\": 345,\n", - " \"length\": 9\n", - " }\n", - " }\n", - " ]\n", - " }\n", - " ],\n", - " \"paragraphs\": [\n", - " {\n", - " \"content\": \"Contoso\",\n", - " \"source\": \"D(1,774,71,973,70,974,111,774,113)\",\n", - " \"span\": {\n", - " \"offset\": 0,\n", - " \"length\": 7\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"123 Main Street Redmond, WA 98052\",\n", - " \"source\": \"D(1,698,189,911,188,911,247,699,249)\",\n", - " \"span\": {\n", - " \"offset\": 9,\n", - " \"length\": 33\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"987-654-3210\",\n", - " \"source\": \"D(1,699,298,842,298,842,322,699,322)\",\n", - " \"span\": {\n", - " \"offset\": 44,\n", - " \"length\": 12\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"6/10/2019 13:59 Sales Associate: Paul\",\n", - " \"source\": \"D(1,699,372,923,372,923,433,699,433)\",\n", - " \"span\": {\n", - " \"offset\": 58,\n", - " \"length\": 37\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"2 Surface Pro 6\",\n", - " \"source\": \"D(1,691,471,911,470,911,514,691,515)\",\n", - " 
\"span\": {\n", - " \"offset\": 115,\n", - " \"length\": 15\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"$1,998.00\",\n", - " \"source\": \"D(1,911,470,1057,470,1057,513,911,514)\",\n", - " \"span\": {\n", - " \"offset\": 140,\n", - " \"length\": 9\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"3 Surface Pen\",\n", - " \"source\": \"D(1,691,515,911,514,912,556,691,557)\",\n", - " \"span\": {\n", - " \"offset\": 170,\n", - " \"length\": 13\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"$299.97\",\n", - " \"source\": \"D(1,911,514,1057,513,1057,555,912,556)\",\n", - " \"span\": {\n", - " \"offset\": 193,\n", - " \"length\": 7\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"Sub-Total\",\n", - " \"source\": \"D(1,753,585,909,585,910,627,753,628)\",\n", - " \"span\": {\n", - " \"offset\": 240,\n", - " \"length\": 9\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"$2,297.97\",\n", - " \"source\": \"D(1,909,585,1060,586,1060,627,910,627)\",\n", - " \"span\": {\n", - " \"offset\": 259,\n", - " \"length\": 9\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"Tax\",\n", - " \"source\": \"D(1,753,628,910,627,910,683,754,684)\",\n", - " \"span\": {\n", - " \"offset\": 289,\n", - " \"length\": 3\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"$218.31\",\n", - " \"source\": \"D(1,910,627,1060,627,1061,683,910,683)\",\n", - " \"span\": {\n", - " \"offset\": 302,\n", - " \"length\": 7\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"Total\",\n", - " \"source\": \"D(1,754,684,910,683,910,747,754,748)\",\n", - " \"span\": {\n", - " \"offset\": 330,\n", - " \"length\": 5\n", - " }\n", - " },\n", - " {\n", - " \"content\": \"$2,516.28\",\n", - " \"source\": \"D(1,910,683,1061,683,1062,747,910,747)\",\n", - " \"span\": {\n", - " \"offset\": 345,\n", - " \"length\": 9\n", - " }\n", - " }\n", - " ],\n", - " \"sections\": [\n", - " {\n", - " \"span\": {\n", - " \"offset\": 0,\n", - " \"length\": 374\n", - " },\n", - " \"elements\": [\n", - " 
\"/paragraphs/0\",\n", - " \"/paragraphs/1\",\n", - " \"/paragraphs/2\",\n", - " \"/paragraphs/3\",\n", - " \"/tables/0\",\n", - " \"/tables/1\"\n", - " ]\n", - " }\n", - " ],\n", - " \"tables\": [\n", - " {\n", - " \"rowCount\": 2,\n", - " \"columnCount\": 2,\n", - " \"cells\": [\n", - " {\n", - " \"kind\": \"content\",\n", - " \"rowIndex\": 0,\n", - " \"columnIndex\": 0,\n", - " \"rowSpan\": 1,\n", - " \"columnSpan\": 1,\n", - " \"content\": \"2 Surface Pro 6\",\n", - " \"source\": \"D(1,691,471,911,470,911,514,691,515)\",\n", - " \"span\": {\n", - " \"offset\": 115,\n", - " \"length\": 15\n", - " },\n", - " \"elements\": [\n", - " \"/paragraphs/4\"\n", - " ]\n", - " },\n", - " {\n", - " \"kind\": \"content\",\n", - " \"rowIndex\": 0,\n", - " \"columnIndex\": 1,\n", - " \"rowSpan\": 1,\n", - " \"columnSpan\": 1,\n", - " \"content\": \"$1,998.00\",\n", - " \"source\": \"D(1,911,470,1057,470,1057,513,911,514)\",\n", - " \"span\": {\n", - " \"offset\": 140,\n", - " \"length\": 9\n", - " },\n", - " \"elements\": [\n", - " \"/paragraphs/5\"\n", - " ]\n", - " },\n", - " {\n", - " \"kind\": \"content\",\n", - " \"rowIndex\": 1,\n", - " \"columnIndex\": 0,\n", - " \"rowSpan\": 1,\n", - " \"columnSpan\": 1,\n", - " \"content\": \"3 Surface Pen\",\n", - " \"source\": \"D(1,691,515,911,514,912,556,691,557)\",\n", - " \"span\": {\n", - " \"offset\": 170,\n", - " \"length\": 13\n", - " },\n", - " \"elements\": [\n", - " \"/paragraphs/6\"\n", - " ]\n", - " },\n", - " {\n", - " \"kind\": \"content\",\n", - " \"rowIndex\": 1,\n", - " \"columnIndex\": 1,\n", - " \"rowSpan\": 1,\n", - " \"columnSpan\": 1,\n", - " \"content\": \"$299.97\",\n", - " \"source\": \"D(1,911,514,1057,513,1057,555,912,556)\",\n", - " \"span\": {\n", - " \"offset\": 193,\n", - " \"length\": 7\n", - " },\n", - " \"elements\": [\n", - " \"/paragraphs/7\"\n", - " ]\n", - " }\n", - " ],\n", - " \"source\": \"D(1,698,479,1050,478,1051,551,698,552)\",\n", - " \"span\": {\n", - " \"offset\": 98,\n", - " 
\"length\": 122\n", - " }\n", - " },\n", - " {\n", - " \"rowCount\": 3,\n", - " \"columnCount\": 2,\n", - " \"cells\": [\n", - " {\n", - " \"kind\": \"content\",\n", - " \"rowIndex\": 0,\n", - " \"columnIndex\": 0,\n", - " \"rowSpan\": 1,\n", - " \"columnSpan\": 1,\n", - " \"content\": \"Sub-Total\",\n", - " \"source\": \"D(1,753,585,909,585,910,627,753,628)\",\n", - " \"span\": {\n", - " \"offset\": 240,\n", - " \"length\": 9\n", - " },\n", - " \"elements\": [\n", - " \"/paragraphs/8\"\n", - " ]\n", - " },\n", - " {\n", - " \"kind\": \"content\",\n", - " \"rowIndex\": 0,\n", - " \"columnIndex\": 1,\n", - " \"rowSpan\": 1,\n", - " \"columnSpan\": 1,\n", - " \"content\": \"$2,297.97\",\n", - " \"source\": \"D(1,909,585,1060,586,1060,627,910,627)\",\n", - " \"span\": {\n", - " \"offset\": 259,\n", - " \"length\": 9\n", - " },\n", - " \"elements\": [\n", - " \"/paragraphs/9\"\n", - " ]\n", - " },\n", - " {\n", - " \"kind\": \"content\",\n", - " \"rowIndex\": 1,\n", - " \"columnIndex\": 0,\n", - " \"rowSpan\": 1,\n", - " \"columnSpan\": 1,\n", - " \"content\": \"Tax\",\n", - " \"source\": \"D(1,753,628,910,627,910,683,754,684)\",\n", - " \"span\": {\n", - " \"offset\": 289,\n", - " \"length\": 3\n", - " },\n", - " \"elements\": [\n", - " \"/paragraphs/10\"\n", - " ]\n", - " },\n", - " {\n", - " \"kind\": \"content\",\n", - " \"rowIndex\": 1,\n", - " \"columnIndex\": 1,\n", - " \"rowSpan\": 1,\n", - " \"columnSpan\": 1,\n", - " \"content\": \"$218.31\",\n", - " \"source\": \"D(1,910,627,1060,627,1061,683,910,683)\",\n", - " \"span\": {\n", - " \"offset\": 302,\n", - " \"length\": 7\n", - " },\n", - " \"elements\": [\n", - " \"/paragraphs/11\"\n", - " ]\n", - " },\n", - " {\n", - " \"kind\": \"content\",\n", - " \"rowIndex\": 2,\n", - " \"columnIndex\": 0,\n", - " \"rowSpan\": 1,\n", - " \"columnSpan\": 1,\n", - " \"content\": \"Total\",\n", - " \"source\": \"D(1,754,684,910,683,910,747,754,748)\",\n", - " \"span\": {\n", - " \"offset\": 330,\n", - " \"length\": 5\n", - 
" },\n", - " \"elements\": [\n", - " \"/paragraphs/12\"\n", - " ]\n", - " },\n", - " {\n", - " \"kind\": \"content\",\n", - " \"rowIndex\": 2,\n", - " \"columnIndex\": 1,\n", - " \"rowSpan\": 1,\n", - " \"columnSpan\": 1,\n", - " \"content\": \"$2,516.28\",\n", - " \"source\": \"D(1,910,683,1061,683,1062,747,910,747)\",\n", - " \"span\": {\n", - " \"offset\": 345,\n", - " \"length\": 9\n", - " },\n", - " \"elements\": [\n", - " \"/paragraphs/13\"\n", - " ]\n", - " }\n", - " ],\n", - " \"source\": \"D(1,759,593,1056,592,1057,741,760,742)\",\n", - " \"span\": {\n", - " \"offset\": 223,\n", - " \"length\": 151\n", - " }\n", - " }\n", - " ]\n", - " }\n", - " ]\n", - " }\n", - "}\n" - ] - } - ], + "outputs": [], "source": [ "response = client.begin_analyze(CUSTOM_ANALYZER_ID, file_location='../data/receipt.png')\n", "result_json = client.poll_result(response)\n", @@ -1331,27 +216,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:python.content_understanding_client:Analyzer train-sample-58b42e42-e59e-41a1-862f-40e14e0ba0a3 deleted.\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "client.delete_analyzer(CUSTOM_ANALYZER_ID)" ] diff --git a/notebooks/field_extraction_pro_mode.ipynb b/notebooks/field_extraction_pro_mode.ipynb index 66b4f53..5111407 100644 --- a/notebooks/field_extraction_pro_mode.ipynb +++ b/notebooks/field_extraction_pro_mode.ipynb @@ -28,9 +28,10 @@ "source": [ "## Prerequisites\n", "1. Ensure Azure AI service is configured following [steps](../README.md#configure-azure-ai-service-resource)\n", - "1. 
If using reference documents, please follow [Set env for reference doc](../docs/set_env_for_training_data_and_reference_doc.md) to set up `REFERENCE_DOC_SAS_URL` and `REFERENCE_DOC_PATH` in the [.env](./.env) file.\n", - " - `REFERENCE_DOC_SAS_URL`: SAS URL for your Azure Blob container.\n", - " - `REFERENCE_DOC_PATH`: Folder path within the container for uploading reference docs.\n", + "1. If using reference documents, please follow [Set env for reference doc](../docs/set_env_for_training_data_and_reference_doc.md) to set up reference document related environment variables in the [.env](./.env) file.\n", + " - You can either set `REFERENCE_DOC_SAS_URL` directly with the SAS URL for your Azure Blob container,\n", + " - Or set both `REFERENCE_DOC_STORAGE_ACCOUNT_NAME` and `REFERENCE_DOC_CONTAINER_NAME`, so the SAS URL can be generated automatically during one of the later steps.\n", + " - Also set `REFERENCE_DOC_PATH` to specify the folder path within the container where reference documents will be uploaded.\n", " > ⚠️ Note: Reference documents are optional in Pro mode. You can run Pro mode using just input documents. For example, the service can reason across two or more input files even without any reference data.\n", "1. Install the required packages to run the sample." 
] @@ -157,12 +158,12 @@ "source": [ "## Prepare reference data\n", "In this step, we will \n", + "- Use `REFERENCE_DOC_PATH` and SAS URL related environment variables that were set in the Prerequisites step.\n", + "- Try to get the SAS URL from the environment variable `REFERENCE_DOC_SAS_URL`.\n", + "If this is not set, we attempt to generate the SAS URL automatically using the environment variables `REFERENCE_DOC_STORAGE_ACCOUNT_NAME` and `REFERENCE_DOC_CONTAINER_NAME`.\n", "- Use Azure AI service to Extract OCR results from reference documents (if needed).\n", "- Generate a reference `.jsonl` file.\n", - "- Upload these files to the designated Azure blob storage.\n", - "\n", - "We use **REFERENCE_DOC_SAS_URL** and **REFERENCE_DOC_PATH** that's set in the Prerequisites step.\n", - "\n" + "- Upload these files to the designated Azure blob storage.\n" ] }, { @@ -172,8 +173,17 @@ "outputs": [], "source": [ "# Load reference storage configuration from environment\n", + "REFERENCE_DOC_PATH = os.getenv(\"REFERENCE_DOC_PATH\")\n", + "\n", "REFERENCE_DOC_SAS_URL = os.getenv(\"REFERENCE_DOC_SAS_URL\")\n", - "REFERENCE_DOC_PATH = os.getenv(\"REFERENCE_DOC_PATH\")" + "if not REFERENCE_DOC_SAS_URL:\n", + " REFERENCE_DOC_STORAGE_ACCOUNT_NAME = os.getenv(\"REFERENCE_DOC_STORAGE_ACCOUNT_NAME\")\n", + " REFERENCE_DOC_CONTAINER_NAME = os.getenv(\"REFERENCE_DOC_CONTAINER_NAME\")\n", + " if REFERENCE_DOC_STORAGE_ACCOUNT_NAME and REFERENCE_DOC_CONTAINER_NAME:\n", + " REFERENCE_DOC_SAS_URL = AzureContentUnderstandingClient.generate_temp_container_sas_url(\n", + " REFERENCE_DOC_STORAGE_ACCOUNT_NAME,\n", + " REFERENCE_DOC_CONTAINER_NAME,\n", + " )" ] }, { @@ -332,7 +342,6 @@ "reference_docs_2 = \"../data/field_extraction_pro_mode/insurance_claims_review/reference_docs\"\n", "\n", "# Load reference storage configuration from environment\n", - "REFERENCE_DOC_SAS_URL_2 = os.getenv(\"REFERENCE_DOC_SAS_URL\") # Reuse the same blob container\n", "REFERENCE_DOC_PATH_2 = 
os.getenv(\"REFERENCE_DOC_PATH\").rstrip(\"/\") + \"_2/\" # NOTE: Use a different path for the second sample\n", "CUSTOM_ANALYZER_ID_2 = \"pro-mode-sample-\" + str(uuid.uuid4())" ] @@ -352,7 +361,8 @@ "outputs": [], "source": [ "logging.info(\"Start generating knowledge base for the second sample...\")\n", - "await client.generate_knowledge_base_on_blob(reference_docs_2, REFERENCE_DOC_SAS_URL_2, REFERENCE_DOC_PATH_2, skip_analyze=True)" + "# Reuse the same blob container\n", + "await client.generate_knowledge_base_on_blob(reference_docs_2, REFERENCE_DOC_SAS_URL, REFERENCE_DOC_PATH_2, skip_analyze=True)" ] }, { @@ -372,7 +382,7 @@ "response = client.begin_create_analyzer(\n", " CUSTOM_ANALYZER_ID_2,\n", " analyzer_template_path=analyzer_template_2,\n", - " pro_mode_reference_docs_storage_container_sas_url=REFERENCE_DOC_SAS_URL_2,\n", + " pro_mode_reference_docs_storage_container_sas_url=REFERENCE_DOC_SAS_URL,\n", " pro_mode_reference_docs_storage_container_path_prefix=REFERENCE_DOC_PATH_2,\n", ")\n", "result = client.poll_result(response)\n", From b7fbbe7491141f44ced9e6ab7b6fa22ac3df8b0c Mon Sep 17 00:00:00 2001 From: "ds.chienyuanchang@gmail.com" Date: Wed, 30 Jul 2025 22:38:57 +0000 Subject: [PATCH 4/8] revise instruction of set env --- ...env_for_training_data_and_reference_doc.md | 48 ++++++++++++++----- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/docs/set_env_for_training_data_and_reference_doc.md b/docs/set_env_for_training_data_and_reference_doc.md index 2fd8c71..b4f81d0 100644 --- a/docs/set_env_for_training_data_and_reference_doc.md +++ b/docs/set_env_for_training_data_and_reference_doc.md @@ -6,23 +6,45 @@ Folders [document_training](../data/document_training/) and [field_extraction_pr 2. *Install Azure Storage Explorer:* Azure Storage Explorer is a tool which makes it easy to work with Azure Storage data. Install it and login with your credential, follow the [guide](https://aka.ms/download-and-install-Azure-Storage-Explorer). 3. 
*Create or Choose a Blob Container:* Create a blob container from Azure Storage Explorer or use an existing one. -4. *Generate a Shared Access Signature (SAS) URL:* - - Right-click on blob container and select the `Get Shared Access Signature...` in the menu. - - Check the required permissions: `Read`, `Write` and `List` - - Click the `Create` button. - -5. *Copy the SAS URL:* After creating the SAS, click `Copy` to get the URL with token. This will be used as the value for **TRAINING_DATA_SAS_URL** or **REFERENCE_DOC_SAS_URL** when running the sample code. - -6. *Set Environment Variables in ".env" File:* Depending on the sample that you will run, you will need to set required environment variables in [.env](../notebooks/.env). - > NOTE: **REFERENCE_DOC_SAS_URL** can be the same as the **TRAINING_DATA_SAS_URL** to re-use the same blob container - - [analyzer_training](../notebooks/analyzer_training.ipynb): Add the SAS URL as value of **TRAINIGN_DATA_SAS_URL**, and a prefix for **TRAINING_DATA_PATH**. You can choose any folder name you like for **TRAINING_DATA_PATH**. For example, you could use "training_files". +4. *Set SAS URL Related Environment Variables in ".env" File:* Depending on the sample that you will run, you will need to set required environment variables in [.env](../notebooks/.env). There are two options to set up environment variables to utilize required Shared Access Signature (SAS) URL. + - Option A - Generate a SAS URL manually on Azure Storage Explorer + - Right-click on blob container and select the `Get Shared Access Signature...` in the menu. + - Check the required permissions: `Read`, `Write` and `List` + - Click the `Create` button. + + - *Copy the SAS URL:* After creating the SAS, click `Copy` to get the URL with token. This will be used as the value for **TRAINING_DATA_SAS_URL** or **REFERENCE_DOC_SAS_URL** when running the sample code. + + - Set the following in [.env](../notebooks/.env). 
+ > NOTE: **REFERENCE_DOC_SAS_URL** can be the same as the **TRAINING_DATA_SAS_URL** to re-use the same blob container + - For [analyzer_training](../notebooks/analyzer_training.ipynb): Add the SAS URL as value of **TRAINING_DATA_SAS_URL**. + ```env + TRAINING_DATA_SAS_URL= + ``` + - For [field_extraction_pro_mode](../notebooks/field_extraction_pro_mode.ipynb): Add the SAS URL as value of **REFERENCE_DOC_SAS_URL**. + ```env + REFERENCE_DOC_SAS_URL= + ``` + - Option B - Auto-generate the SAS URL via code in sample notebooks + - Instead of manually creating a SAS URL, you can set storage account and container information, and let the code generate a temporary SAS URL at runtime. + > NOTE: **TRAINING_DATA_STORAGE_ACCOUNT_NAME** and **TRAINING_DATA_CONTAINER_NAME** can be the same as the **REFERENCE_DOC_STORAGE_ACCOUNT_NAME** and **REFERENCE_DOC_CONTAINER_NAME** to re-use the same blob container + - For [analyzer_training](../notebooks/analyzer_training.ipynb): Add the storage account name as `TRAINING_DATA_STORAGE_ACCOUNT_NAME` and the container name under that storage account as `TRAINING_DATA_CONTAINER_NAME`. + ```env + TRAINING_DATA_STORAGE_ACCOUNT_NAME= + TRAINING_DATA_CONTAINER_NAME= + ``` + - For [field_extraction_pro_mode](../notebooks/field_extraction_pro_mode.ipynb): Add the storage account name as `REFERENCE_DOC_STORAGE_ACCOUNT_NAME` and the container name under that storage account as `REFERENCE_DOC_CONTAINER_NAME`. + ```env + REFERENCE_DOC_STORAGE_ACCOUNT_NAME= + REFERENCE_DOC_CONTAINER_NAME= + ``` + +5. *Set Folder Prefix in ".env" File:* Depending on the sample that you will run, you will need to set required environment variables in [.env](../notebooks/.env). + - For [analyzer_training](../notebooks/analyzer_training.ipynb): Add a prefix for **TRAINING_DATA_PATH**. You can choose any folder name you like for **TRAINING_DATA_PATH**. For example, you could use "training_files". 
```env - TRAINING_DATA_SAS_URL= TRAINING_DATA_PATH= ``` - - [field_extraction_pro_mode](../notebooks/field_extraction_pro_mode.ipynb): Add the SAS URL as value of **REFERENCE_DOC_SAS_URL**, and a prefix for **REFERENCE_DOC_PATH**. You can choose any folder name you like for **REFERENCE_DOC_PATH**. For example, you could use "reference_docs". + - For [field_extraction_pro_mode](../notebooks/field_extraction_pro_mode.ipynb): Add a prefix for **REFERENCE_DOC_PATH**. You can choose any folder name you like for **REFERENCE_DOC_PATH**. For example, you could use "reference_docs". ```env - REFERENCE_DOC_SAS_URL= REFERENCE_DOC_PATH= ``` From a18d10ca1273c1118d1ee2a1f5b7804298c6b9e5 Mon Sep 17 00:00:00 2001 From: "ds.chienyuanchang@gmail.com" Date: Wed, 30 Jul 2025 22:43:09 +0000 Subject: [PATCH 5/8] fix indents --- docs/set_env_for_training_data_and_reference_doc.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/set_env_for_training_data_and_reference_doc.md b/docs/set_env_for_training_data_and_reference_doc.md index b4f81d0..5fb8eab 100644 --- a/docs/set_env_for_training_data_and_reference_doc.md +++ b/docs/set_env_for_training_data_and_reference_doc.md @@ -11,9 +11,9 @@ Folders [document_training](../data/document_training/) and [field_extraction_pr - Right-click on blob container and select the `Get Shared Access Signature...` in the menu. - Check the required permissions: `Read`, `Write` and `List` - Click the `Create` button. - + - *Copy the SAS URL:* After creating the SAS, click `Copy` to get the URL with token. This will be used as the value for **TRAINING_DATA_SAS_URL** or **REFERENCE_DOC_SAS_URL** when running the sample code. - + - Set the following in [.env](../notebooks/.env). > NOTE: **REFERENCE_DOC_SAS_URL** can be the same as the **TRAINING_DATA_SAS_URL** to re-use the same blob container - For [analyzer_training](../notebooks/analyzer_training.ipynb): Add the SAS URL as value of **TRAINING_DATA_SAS_URL**. 
From 74df73bbdd1575c7154cb01f26980faa1ffb531b Mon Sep 17 00:00:00 2001 From: Chien Yuan Chang Date: Wed, 30 Jul 2025 15:44:38 -0700 Subject: [PATCH 6/8] Update set_env_for_training_data_and_reference_doc.md --- docs/set_env_for_training_data_and_reference_doc.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/set_env_for_training_data_and_reference_doc.md b/docs/set_env_for_training_data_and_reference_doc.md index 5fb8eab..84115a9 100644 --- a/docs/set_env_for_training_data_and_reference_doc.md +++ b/docs/set_env_for_training_data_and_reference_doc.md @@ -13,7 +13,8 @@ Folders [document_training](../data/document_training/) and [field_extraction_pr - Click the `Create` button. - *Copy the SAS URL:* After creating the SAS, click `Copy` to get the URL with token. This will be used as the value for **TRAINING_DATA_SAS_URL** or **REFERENCE_DOC_SAS_URL** when running the sample code. - + + - Set the following in [.env](../notebooks/.env). > NOTE: **REFERENCE_DOC_SAS_URL** can be the same as the **TRAINING_DATA_SAS_URL** to re-use the same blob container - For [analyzer_training](../notebooks/analyzer_training.ipynb): Add the SAS URL as value of **TRAINING_DATA_SAS_URL**. 
From 23a8eb60a146d65e4a0d15906f648939043d4465 Mon Sep 17 00:00:00 2001 From: "ds.chienyuanchang@gmail.com" Date: Thu, 31 Jul 2025 00:33:38 +0000 Subject: [PATCH 7/8] remove a commit with output --- ...env_for_training_data_and_reference_doc.md | 1 + notebooks/analyzer_training.ipynb | 26 ++++++++------- notebooks/field_extraction_pro_mode.ipynb | 32 +++++++++++-------- python/content_understanding_client.py | 27 +++++++++++++--- 4 files changed, 57 insertions(+), 29 deletions(-) diff --git a/docs/set_env_for_training_data_and_reference_doc.md b/docs/set_env_for_training_data_and_reference_doc.md index 84115a9..3622029 100644 --- a/docs/set_env_for_training_data_and_reference_doc.md +++ b/docs/set_env_for_training_data_and_reference_doc.md @@ -10,6 +10,7 @@ Folders [document_training](../data/document_training/) and [field_extraction_pr - Option A - Generate a SAS URL manually on Azure Storage Explorer - Right-click on blob container and select the `Get Shared Access Signature...` in the menu. - Check the required permissions: `Read`, `Write` and `List` + - We will need `Write` for uploading, modifying, or appending blobs - Click the `Create` button. - *Copy the SAS URL:* After creating the SAS, click `Copy` to get the URL with token. This will be used as the value for **TRAINING_DATA_SAS_URL** or **REFERENCE_DOC_SAS_URL** when running the sample code. 
diff --git a/notebooks/analyzer_training.ipynb b/notebooks/analyzer_training.ipynb index 73bfa29..773277f 100644 --- a/notebooks/analyzer_training.ipynb +++ b/notebooks/analyzer_training.ipynb @@ -132,22 +132,26 @@ "metadata": {}, "outputs": [], "source": [ - "TRAINING_DATA_SAS_URL = os.getenv(\"TRAINING_DATA_SAS_URL\")\n", - "if not TRAINING_DATA_SAS_URL:\n", + "training_data_sas_url = os.getenv(\"TRAINING_DATA_SAS_URL\")\n", + "if not training_data_sas_url:\n", " TRAINING_DATA_STORAGE_ACCOUNT_NAME = os.getenv(\"TRAINING_DATA_STORAGE_ACCOUNT_NAME\")\n", " TRAINING_DATA_CONTAINER_NAME = os.getenv(\"TRAINING_DATA_CONTAINER_NAME\")\n", - " if not TRAINING_DATA_STORAGE_ACCOUNT_NAME and not TRAINING_DATA_SAS_URL:\n", + " if not TRAINING_DATA_STORAGE_ACCOUNT_NAME and not training_data_sas_url:\n", " raise ValueError(\n", " \"Please set either TRAINING_DATA_SAS_URL or both TRAINING_DATA_STORAGE_ACCCOUNT_NAME and TRAINING_DATA_CONTAINER_NAME environment variables.\"\n", " )\n", - " TRAINING_DATA_SAS_URL = AzureContentUnderstandingClient.generate_temp_container_sas_url(\n", - " TRAINING_DATA_STORAGE_ACCOUNT_NAME,\n", - " TRAINING_DATA_CONTAINER_NAME,\n", + " from azure.storage.blob import ContainerSasPermissions\n", + " # We will need \"Write\" for uploading, modifying, or appending blobs\n", + " training_data_sas_url = AzureContentUnderstandingClient.generate_temp_container_sas_url(\n", + " account_name=TRAINING_DATA_STORAGE_ACCOUNT_NAME,\n", + " container_name=TRAINING_DATA_CONTAINER_NAME,\n", + " permissions=ContainerSasPermissions(read=True, write=True, list=True),\n", + " expiry_hours=1,\n", " )\n", "\n", - "TRAINING_DATA_PATH = os.getenv(\"TRAINING_DATA_PATH\")\n", + "training_data_path = os.getenv(\"TRAINING_DATA_PATH\")\n", "\n", - "await client.generate_training_data_on_blob(training_docs_folder, TRAINING_DATA_SAS_URL, TRAINING_DATA_PATH)" + "await client.generate_training_data_on_blob(training_docs_folder, training_data_sas_url, training_data_path)" ] }, { @@ 
-157,7 +161,7 @@ "## Create analyzer with defined schema\n", "Before creating the analyzer, you should fill in the constant ANALYZER_ID with a relevant name to your task. Here, we generate a unique suffix so this cell can be run multiple times to create different analyzers.\n", "\n", - "We use **TRAINING_DATA_SAS_URL** and **TRAINING_DATA_PATH** that's set up in the [.env](./.env) file and used in the previous step." + "We use **training_data_sas_url** and **training_data_path** that's set up in the [.env](./.env) file and used in the previous step." ] }, { @@ -172,8 +176,8 @@ "response = client.begin_create_analyzer(\n", " CUSTOM_ANALYZER_ID,\n", " analyzer_template_path=analyzer_template,\n", - " training_storage_container_sas_url=TRAINING_DATA_SAS_URL,\n", - " training_storage_container_path_prefix=TRAINING_DATA_PATH,\n", + " training_storage_container_sas_url=training_data_sas_url,\n", + " training_storage_container_path_prefix=training_data_path,\n", ")\n", "result = client.poll_result(response)\n", "if result is not None and \"status\" in result and result[\"status\"] == \"Succeeded\":\n", diff --git a/notebooks/field_extraction_pro_mode.ipynb b/notebooks/field_extraction_pro_mode.ipynb index 5111407..45a5044 100644 --- a/notebooks/field_extraction_pro_mode.ipynb +++ b/notebooks/field_extraction_pro_mode.ipynb @@ -173,16 +173,20 @@ "outputs": [], "source": [ "# Load reference storage configuration from environment\n", - "REFERENCE_DOC_PATH = os.getenv(\"REFERENCE_DOC_PATH\")\n", + "reference_doc_path = os.getenv(\"REFERENCE_DOC_PATH\")\n", "\n", - "REFERENCE_DOC_SAS_URL = os.getenv(\"REFERENCE_DOC_SAS_URL\")\n", - "if not REFERENCE_DOC_SAS_URL:\n", + "reference_doc_sas_url = os.getenv(\"REFERENCE_DOC_SAS_URL\")\n", + "if not reference_doc_sas_url:\n", " REFERENCE_DOC_STORAGE_ACCOUNT_NAME = os.getenv(\"REFERENCE_DOC_STORAGE_ACCOUNT_NAME\")\n", " REFERENCE_DOC_CONTAINER_NAME = os.getenv(\"REFERENCE_DOC_CONTAINER_NAME\")\n", " if 
REFERENCE_DOC_STORAGE_ACCOUNT_NAME and REFERENCE_DOC_CONTAINER_NAME:\n", - " REFERENCE_DOC_SAS_URL = AzureContentUnderstandingClient.generate_temp_container_sas_url(\n", - " REFERENCE_DOC_STORAGE_ACCOUNT_NAME,\n", - " REFERENCE_DOC_CONTAINER_NAME,\n", + " from azure.storage.blob import ContainerSasPermissions\n", + " # We will need \"Write\" for uploading, modifying, or appending blobs\n", + " reference_doc_sas_url = AzureContentUnderstandingClient.generate_temp_container_sas_url(\n", + " account_name=REFERENCE_DOC_STORAGE_ACCOUNT_NAME,\n", + " container_name=REFERENCE_DOC_CONTAINER_NAME,\n", + " permissions=ContainerSasPermissions(read=True, write=True, list=True),\n", + " expiry_hours=1,\n", " )" ] }, @@ -203,7 +207,7 @@ "# Please name the OCR result files with the same name as the original document files including its extension, and add the suffix \".result.json\"\n", "# For example, if the original document is \"invoice.pdf\", the OCR result file should be named \"invoice.pdf.result.json\"\n", "# NOTE: Please comment out the follwing line if you don't have any reference documents.\n", - "await client.generate_knowledge_base_on_blob(reference_docs, REFERENCE_DOC_SAS_URL, REFERENCE_DOC_PATH, skip_analyze=False)" + "await client.generate_knowledge_base_on_blob(reference_docs, reference_doc_sas_url, reference_doc_path, skip_analyze=False)" ] }, { @@ -213,7 +217,7 @@ "## Create analyzer with defined schema for Pro mode\n", "Before creating the analyzer, you should fill in the constant ANALYZER_ID with a relevant name to your task. Here, we generate a unique suffix so this cell can be run multiple times to create different analyzers.\n", "\n", - "We use **REFERENCE_DOC_SAS_URL** and **REFERENCE_DOC_PATH** that's set up in the [.env](./.env) file and used in the previous step." + "We use **reference_doc_sas_url** and **reference_doc_path** that's set up in the [.env](./.env) file and used in the previous step." 
] }, { @@ -228,8 +232,8 @@ "response = client.begin_create_analyzer(\n", " CUSTOM_ANALYZER_ID,\n", " analyzer_template_path=analyzer_template,\n", - " pro_mode_reference_docs_storage_container_sas_url=REFERENCE_DOC_SAS_URL,\n", - " pro_mode_reference_docs_storage_container_path_prefix=REFERENCE_DOC_PATH,\n", + " pro_mode_reference_docs_storage_container_sas_url=reference_doc_sas_url,\n", + " pro_mode_reference_docs_storage_container_path_prefix=reference_doc_path,\n", ")\n", "result = client.poll_result(response)\n", "if result is not None and \"status\" in result and result[\"status\"] == \"Succeeded\":\n", @@ -342,7 +346,7 @@ "reference_docs_2 = \"../data/field_extraction_pro_mode/insurance_claims_review/reference_docs\"\n", "\n", "# Load reference storage configuration from environment\n", - "REFERENCE_DOC_PATH_2 = os.getenv(\"REFERENCE_DOC_PATH\").rstrip(\"/\") + \"_2/\" # NOTE: Use a different path for the second sample\n", + "reference_doc_path_2 = os.getenv(\"REFERENCE_DOC_PATH\").rstrip(\"/\") + \"_2/\" # NOTE: Use a different path for the second sample\n", "CUSTOM_ANALYZER_ID_2 = \"pro-mode-sample-\" + str(uuid.uuid4())" ] }, @@ -362,7 +366,7 @@ "source": [ "logging.info(\"Start generating knowledge base for the second sample...\")\n", "# Reuse the same blob container\n", - "await client.generate_knowledge_base_on_blob(reference_docs_2, REFERENCE_DOC_SAS_URL, REFERENCE_DOC_PATH_2, skip_analyze=True)" + "await client.generate_knowledge_base_on_blob(reference_docs_2, reference_doc_sas_url, reference_doc_path_2, skip_analyze=True)" ] }, { @@ -382,8 +386,8 @@ "response = client.begin_create_analyzer(\n", " CUSTOM_ANALYZER_ID_2,\n", " analyzer_template_path=analyzer_template_2,\n", - " pro_mode_reference_docs_storage_container_sas_url=REFERENCE_DOC_SAS_URL,\n", - " pro_mode_reference_docs_storage_container_path_prefix=REFERENCE_DOC_PATH_2,\n", + " pro_mode_reference_docs_storage_container_sas_url=reference_doc_sas_url,\n", + " 
pro_mode_reference_docs_storage_container_path_prefix=reference_doc_path_2,\n", ")\n", "result = client.poll_result(response)\n", "if result is not None and \"status\" in result and result[\"status\"] == \"Succeeded\":\n", diff --git a/python/content_understanding_client.py b/python/content_understanding_client.py index d4d7027..0558aa4 100644 --- a/python/content_understanding_client.py +++ b/python/content_understanding_client.py @@ -186,26 +186,45 @@ def is_supported_doc_type_by_file_path(file_path: Path, is_document: bool=False) def generate_temp_container_sas_url( account_name: str, container_name: str, + permissions: Optional[ContainerSasPermissions] = None, + expiry_hours: Optional[int] = None, ) -> str: + """ + Generate a temporary SAS URL for an Azure Blob container using Azure AD authentication. + + Args: + account_name (str): The Azure Storage account name. + container_name (str): The name of the container. + permissions (ContainerSasPermissions, optional): Permissions to assign to the SAS token. + Defaults to read, write, and list permissions. + expiry_hours (int, optional): Number of hours until the SAS token expires. + Defaults to `AzureContentUnderstandingClient.SAS_EXPIRY_HOURS`. + + Returns: + str: The SAS URL for the container. 
+ """ + if permissions is None: + permissions = ContainerSasPermissions(read=True, write=True, list=True) + expiry_duration = timedelta(hours=expiry_hours or AzureContentUnderstandingClient.SAS_EXPIRY_HOURS) + account_url = f"https://{account_name}.blob.core.windows.net" blob_service_client = BlobServiceClient(account_url=account_url, credential=DefaultAzureCredential()) # Get user delegation key start_time = datetime.now(timezone.utc) - expiry_time = start_time + timedelta(hours=AzureContentUnderstandingClient.SAS_EXPIRY_HOURS) + expiry_time = start_time + expiry_duration delegation_key = blob_service_client.get_user_delegation_key(start_time, expiry_time) sas_token = generate_container_sas( account_name=account_name, container_name=container_name, user_delegation_key=delegation_key, - permission=ContainerSasPermissions(read=True, list=True, write=True), + permission=permissions, expiry=expiry_time, start=start_time, ) - container_sas_url = f"{account_url}/{container_name}?{sas_token}" - return container_sas_url + return f"{account_url}/{container_name}?{sas_token}" def get_all_analyzers(self) -> Dict[str, Any]: """ From 6ae6595f14ece67c56d363acb16748c52ef40e53 Mon Sep 17 00:00:00 2001 From: "ds.chienyuanchang@gmail.com" Date: Thu, 31 Jul 2025 00:44:54 +0000 Subject: [PATCH 8/8] set default to read and list --- python/content_understanding_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/content_understanding_client.py b/python/content_understanding_client.py index 0558aa4..83b518e 100644 --- a/python/content_understanding_client.py +++ b/python/content_understanding_client.py @@ -196,7 +196,7 @@ def generate_temp_container_sas_url( account_name (str): The Azure Storage account name. container_name (str): The name of the container. permissions (ContainerSasPermissions, optional): Permissions to assign to the SAS token. - Defaults to read, write, and list permissions. + Defaults to read and list permissions. 
expiry_hours (int, optional): Number of hours until the SAS token expires. Defaults to `AzureContentUnderstandingClient.SAS_EXPIRY_HOURS`. @@ -204,7 +204,7 @@ def generate_temp_container_sas_url( str: The SAS URL for the container. """ if permissions is None: - permissions = ContainerSasPermissions(read=True, write=True, list=True) + permissions = ContainerSasPermissions(read=True, list=True) expiry_duration = timedelta(hours=expiry_hours or AzureContentUnderstandingClient.SAS_EXPIRY_HOURS) account_url = f"https://{account_name}.blob.core.windows.net"