Skip to content

Commit 23a8eb6

Browse files
remove a commit with output
1 parent 74df73b commit 23a8eb6

File tree

4 files changed

+57
-29
lines changed

4 files changed

+57
-29
lines changed

docs/set_env_for_training_data_and_reference_doc.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ Folders [document_training](../data/document_training/) and [field_extraction_pr
1010
- Option A - Generate a SAS URL manually on Azure Storage Explorer
1111
- Right-click on blob container and select the `Get Shared Access Signature...` in the menu.
1212
- Check the required permissions: `Read`, `Write` and `List`
13+
- We will need `Write` for uploading, modifying, or appending blobs
1314
- Click the `Create` button.
1415
<img src="./get-access-signature.png" height="600" /> <img src="./choose-signature-options.png" height="600" />
1516
- *Copy the SAS URL:* After creating the SAS, click `Copy` to get the URL with token. This will be used as the value for **TRAINING_DATA_SAS_URL** or **REFERENCE_DOC_SAS_URL** when running the sample code.

notebooks/analyzer_training.ipynb

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -132,22 +132,26 @@
132132
"metadata": {},
133133
"outputs": [],
134134
"source": [
135-
"TRAINING_DATA_SAS_URL = os.getenv(\"TRAINING_DATA_SAS_URL\")\n",
136-
"if not TRAINING_DATA_SAS_URL:\n",
135+
"training_data_sas_url = os.getenv(\"TRAINING_DATA_SAS_URL\")\n",
136+
"if not training_data_sas_url:\n",
137137
" TRAINING_DATA_STORAGE_ACCOUNT_NAME = os.getenv(\"TRAINING_DATA_STORAGE_ACCOUNT_NAME\")\n",
138138
" TRAINING_DATA_CONTAINER_NAME = os.getenv(\"TRAINING_DATA_CONTAINER_NAME\")\n",
139-
" if not TRAINING_DATA_STORAGE_ACCOUNT_NAME and not TRAINING_DATA_SAS_URL:\n",
139+
" if not TRAINING_DATA_STORAGE_ACCOUNT_NAME and not training_data_sas_url:\n",
140140
" raise ValueError(\n",
141141
"            \"Please set either TRAINING_DATA_SAS_URL or both TRAINING_DATA_STORAGE_ACCOUNT_NAME and TRAINING_DATA_CONTAINER_NAME environment variables.\"\n",
142142
" )\n",
143-
" TRAINING_DATA_SAS_URL = AzureContentUnderstandingClient.generate_temp_container_sas_url(\n",
144-
" TRAINING_DATA_STORAGE_ACCOUNT_NAME,\n",
145-
" TRAINING_DATA_CONTAINER_NAME,\n",
143+
" from azure.storage.blob import ContainerSasPermissions\n",
144+
" # We will need \"Write\" for uploading, modifying, or appending blobs\n",
145+
" training_data_sas_url = AzureContentUnderstandingClient.generate_temp_container_sas_url(\n",
146+
" account_name=TRAINING_DATA_STORAGE_ACCOUNT_NAME,\n",
147+
" container_name=TRAINING_DATA_CONTAINER_NAME,\n",
148+
" permissions=ContainerSasPermissions(read=True, write=True, list=True),\n",
149+
" expiry_hours=1,\n",
146150
" )\n",
147151
"\n",
148-
"TRAINING_DATA_PATH = os.getenv(\"TRAINING_DATA_PATH\")\n",
152+
"training_data_path = os.getenv(\"TRAINING_DATA_PATH\")\n",
149153
"\n",
150-
"await client.generate_training_data_on_blob(training_docs_folder, TRAINING_DATA_SAS_URL, TRAINING_DATA_PATH)"
154+
"await client.generate_training_data_on_blob(training_docs_folder, training_data_sas_url, training_data_path)"
151155
]
152156
},
153157
{
@@ -157,7 +161,7 @@
157161
"## Create analyzer with defined schema\n",
158162
"Before creating the analyzer, you should fill in the constant CUSTOM_ANALYZER_ID with a name relevant to your task. Here, we generate a unique suffix so this cell can be run multiple times to create different analyzers.\n",
159163
"\n",
160-
"We use **TRAINING_DATA_SAS_URL** and **TRAINING_DATA_PATH** that's set up in the [.env](./.env) file and used in the previous step."
164+
"We use the **training_data_sas_url** and **training_data_path** values that were set up from the [.env](./.env) file and used in the previous step."
161165
]
162166
},
163167
{
@@ -172,8 +176,8 @@
172176
"response = client.begin_create_analyzer(\n",
173177
" CUSTOM_ANALYZER_ID,\n",
174178
" analyzer_template_path=analyzer_template,\n",
175-
" training_storage_container_sas_url=TRAINING_DATA_SAS_URL,\n",
176-
" training_storage_container_path_prefix=TRAINING_DATA_PATH,\n",
179+
" training_storage_container_sas_url=training_data_sas_url,\n",
180+
" training_storage_container_path_prefix=training_data_path,\n",
177181
")\n",
178182
"result = client.poll_result(response)\n",
179183
"if result is not None and \"status\" in result and result[\"status\"] == \"Succeeded\":\n",

notebooks/field_extraction_pro_mode.ipynb

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -173,16 +173,20 @@
173173
"outputs": [],
174174
"source": [
175175
"# Load reference storage configuration from environment\n",
176-
"REFERENCE_DOC_PATH = os.getenv(\"REFERENCE_DOC_PATH\")\n",
176+
"reference_doc_path = os.getenv(\"REFERENCE_DOC_PATH\")\n",
177177
"\n",
178-
"REFERENCE_DOC_SAS_URL = os.getenv(\"REFERENCE_DOC_SAS_URL\")\n",
179-
"if not REFERENCE_DOC_SAS_URL:\n",
178+
"reference_doc_sas_url = os.getenv(\"REFERENCE_DOC_SAS_URL\")\n",
179+
"if not reference_doc_sas_url:\n",
180180
" REFERENCE_DOC_STORAGE_ACCOUNT_NAME = os.getenv(\"REFERENCE_DOC_STORAGE_ACCOUNT_NAME\")\n",
181181
" REFERENCE_DOC_CONTAINER_NAME = os.getenv(\"REFERENCE_DOC_CONTAINER_NAME\")\n",
182182
" if REFERENCE_DOC_STORAGE_ACCOUNT_NAME and REFERENCE_DOC_CONTAINER_NAME:\n",
183-
" REFERENCE_DOC_SAS_URL = AzureContentUnderstandingClient.generate_temp_container_sas_url(\n",
184-
" REFERENCE_DOC_STORAGE_ACCOUNT_NAME,\n",
185-
" REFERENCE_DOC_CONTAINER_NAME,\n",
183+
" from azure.storage.blob import ContainerSasPermissions\n",
184+
" # We will need \"Write\" for uploading, modifying, or appending blobs\n",
185+
" reference_doc_sas_url = AzureContentUnderstandingClient.generate_temp_container_sas_url(\n",
186+
" account_name=REFERENCE_DOC_STORAGE_ACCOUNT_NAME,\n",
187+
" container_name=REFERENCE_DOC_CONTAINER_NAME,\n",
188+
" permissions=ContainerSasPermissions(read=True, write=True, list=True),\n",
189+
" expiry_hours=1,\n",
186190
" )"
187191
]
188192
},
@@ -203,7 +207,7 @@
203207
"# Please name the OCR result files with the same name as the original document files including its extension, and add the suffix \".result.json\"\n",
204208
"# For example, if the original document is \"invoice.pdf\", the OCR result file should be named \"invoice.pdf.result.json\"\n",
205209
"# NOTE: Please comment out the following line if you don't have any reference documents.\n",
206-
"await client.generate_knowledge_base_on_blob(reference_docs, REFERENCE_DOC_SAS_URL, REFERENCE_DOC_PATH, skip_analyze=False)"
210+
"await client.generate_knowledge_base_on_blob(reference_docs, reference_doc_sas_url, reference_doc_path, skip_analyze=False)"
207211
]
208212
},
209213
{
@@ -213,7 +217,7 @@
213217
"## Create analyzer with defined schema for Pro mode\n",
214218
"Before creating the analyzer, you should fill in the constant CUSTOM_ANALYZER_ID with a name relevant to your task. Here, we generate a unique suffix so this cell can be run multiple times to create different analyzers.\n",
215219
"\n",
216-
"We use **REFERENCE_DOC_SAS_URL** and **REFERENCE_DOC_PATH** that's set up in the [.env](./.env) file and used in the previous step."
220+
"We use the **reference_doc_sas_url** and **reference_doc_path** values that were set up from the [.env](./.env) file and used in the previous step."
217221
]
218222
},
219223
{
@@ -228,8 +232,8 @@
228232
"response = client.begin_create_analyzer(\n",
229233
" CUSTOM_ANALYZER_ID,\n",
230234
" analyzer_template_path=analyzer_template,\n",
231-
" pro_mode_reference_docs_storage_container_sas_url=REFERENCE_DOC_SAS_URL,\n",
232-
" pro_mode_reference_docs_storage_container_path_prefix=REFERENCE_DOC_PATH,\n",
235+
" pro_mode_reference_docs_storage_container_sas_url=reference_doc_sas_url,\n",
236+
" pro_mode_reference_docs_storage_container_path_prefix=reference_doc_path,\n",
233237
")\n",
234238
"result = client.poll_result(response)\n",
235239
"if result is not None and \"status\" in result and result[\"status\"] == \"Succeeded\":\n",
@@ -342,7 +346,7 @@
342346
"reference_docs_2 = \"../data/field_extraction_pro_mode/insurance_claims_review/reference_docs\"\n",
343347
"\n",
344348
"# Load reference storage configuration from environment\n",
345-
"REFERENCE_DOC_PATH_2 = os.getenv(\"REFERENCE_DOC_PATH\").rstrip(\"/\") + \"_2/\" # NOTE: Use a different path for the second sample\n",
349+
"reference_doc_path_2 = os.getenv(\"REFERENCE_DOC_PATH\").rstrip(\"/\") + \"_2/\" # NOTE: Use a different path for the second sample\n",
346350
"CUSTOM_ANALYZER_ID_2 = \"pro-mode-sample-\" + str(uuid.uuid4())"
347351
]
348352
},
@@ -362,7 +366,7 @@
362366
"source": [
363367
"logging.info(\"Start generating knowledge base for the second sample...\")\n",
364368
"# Reuse the same blob container\n",
365-
"await client.generate_knowledge_base_on_blob(reference_docs_2, REFERENCE_DOC_SAS_URL, REFERENCE_DOC_PATH_2, skip_analyze=True)"
369+
"await client.generate_knowledge_base_on_blob(reference_docs_2, reference_doc_sas_url, reference_doc_path_2, skip_analyze=True)"
366370
]
367371
},
368372
{
@@ -382,8 +386,8 @@
382386
"response = client.begin_create_analyzer(\n",
383387
" CUSTOM_ANALYZER_ID_2,\n",
384388
" analyzer_template_path=analyzer_template_2,\n",
385-
" pro_mode_reference_docs_storage_container_sas_url=REFERENCE_DOC_SAS_URL,\n",
386-
" pro_mode_reference_docs_storage_container_path_prefix=REFERENCE_DOC_PATH_2,\n",
389+
" pro_mode_reference_docs_storage_container_sas_url=reference_doc_sas_url,\n",
390+
" pro_mode_reference_docs_storage_container_path_prefix=reference_doc_path_2,\n",
387391
")\n",
388392
"result = client.poll_result(response)\n",
389393
"if result is not None and \"status\" in result and result[\"status\"] == \"Succeeded\":\n",

python/content_understanding_client.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -186,26 +186,45 @@ def is_supported_doc_type_by_file_path(file_path: Path, is_document: bool=False)
186186
def generate_temp_container_sas_url(
187187
account_name: str,
188188
container_name: str,
189+
permissions: Optional[ContainerSasPermissions] = None,
190+
expiry_hours: Optional[int] = None,
189191
) -> str:
192+
"""
193+
Generate a temporary SAS URL for an Azure Blob container using Azure AD authentication.
194+
195+
Args:
196+
account_name (str): The Azure Storage account name.
197+
container_name (str): The name of the container.
198+
permissions (ContainerSasPermissions, optional): Permissions to assign to the SAS token.
199+
Defaults to read, write, and list permissions.
200+
expiry_hours (int, optional): Number of hours until the SAS token expires.
201+
Defaults to `AzureContentUnderstandingClient.SAS_EXPIRY_HOURS`.
202+
203+
Returns:
204+
str: The SAS URL for the container.
205+
"""
206+
if permissions is None:
207+
permissions = ContainerSasPermissions(read=True, write=True, list=True)
208+
expiry_duration = timedelta(hours=expiry_hours or AzureContentUnderstandingClient.SAS_EXPIRY_HOURS)
209+
190210
account_url = f"https://{account_name}.blob.core.windows.net"
191211
blob_service_client = BlobServiceClient(account_url=account_url, credential=DefaultAzureCredential())
192212

193213
# Get user delegation key
194214
start_time = datetime.now(timezone.utc)
195-
expiry_time = start_time + timedelta(hours=AzureContentUnderstandingClient.SAS_EXPIRY_HOURS)
215+
expiry_time = start_time + expiry_duration
196216
delegation_key = blob_service_client.get_user_delegation_key(start_time, expiry_time)
197217

198218
sas_token = generate_container_sas(
199219
account_name=account_name,
200220
container_name=container_name,
201221
user_delegation_key=delegation_key,
202-
permission=ContainerSasPermissions(read=True, list=True, write=True),
222+
permission=permissions,
203223
expiry=expiry_time,
204224
start=start_time,
205225
)
206-
container_sas_url = f"{account_url}/{container_name}?{sas_token}"
207226

208-
return container_sas_url
227+
return f"{account_url}/{container_name}?{sas_token}"
209228

210229
def get_all_analyzers(self) -> Dict[str, Any]:
211230
"""

0 commit comments

Comments
 (0)