-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract.py
115 lines (97 loc) · 3.63 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# Importing required libraries
import os
import time
import requests
import string
import random
import argparse
from dotenv import load_dotenv
from process import processDataToCSV, processDataToObj
# Load environment variables (via python-dotenv) before reading them below
load_dotenv()

# Service configuration read from the environment; each value is None when
# the corresponding variable is not set.
ENDPOINT = os.environ.get("ENDPOINT")
KEY = os.environ.get("KEY")
BLOB_ENDPOINT = os.environ.get("BLOB_ENDPOINT")
BLOB_QUERY = os.environ.get("BLOB_QUERY")
# Uploads the file to Azure blob storage and returns the file url
def uploadToAzureBlob(fileName):
    """Upload a local file to blob storage and return the blob's URL.

    The blob name is a random 15-letter prefix followed by the base name of
    the file, percent-encoded so the resulting URL is valid even when the
    name contains spaces or other reserved characters.

    Args:
        fileName: Path of the local file to upload.

    Returns:
        The blob URL (``BLOB_ENDPOINT`` + blob name) without the
        ``BLOB_QUERY`` suffix.

    Raises:
        OSError: If the file cannot be opened or read.
        requests.HTTPError: If the storage service answers the upload with
            an error status.
    """
    from urllib.parse import quote

    with open(fileName, "rb") as finput:
        data = finput.read()

    randomStr = "".join(random.choices(string.ascii_letters, k=15))
    # Only the base name goes into the blob name: directory parts of a local
    # path (separators, drive letters) do not belong in the URL.
    blobUrl = BLOB_ENDPOINT + quote(randomStr + os.path.basename(fileName))

    # NOTE(review): the content type is fixed to image/jpeg regardless of the
    # actual file type - confirm this is intended.
    response = requests.put(
        url=blobUrl + BLOB_QUERY,
        data=data,
        headers={
            "Content-type": "image/jpeg",
            "x-ms-blob-type": "BlockBlob",
        },
    )
    # Fail here rather than hand back a URL that points at nothing.
    response.raise_for_status()
    return blobUrl
# Deletes the blob at the given URL from Azure blob storage
def deleteFromAzureBlob(fileUrl):
    """Send a DELETE request for the blob at *fileUrl*.

    ``BLOB_QUERY`` is appended to the URL before the request is sent. The
    response is not inspected and nothing is returned.

    Args:
        fileUrl: Blob URL without the ``BLOB_QUERY`` suffix.
    """
    blobHeaders = {
        "Content-type": "image/jpeg",
        "x-ms-blob-type": "BlockBlob",
    }
    requests.delete(url=fileUrl + BLOB_QUERY, headers=blobHeaders)
# Send the document to the Azure form recognizer service for analysis
# This returns a URL that we can use to get the results of the analysis when it is done
def analyze(fileName):
    """Upload a document and start its analysis.

    The analyze request passes the document as ``urlSource``, so the file is
    first uploaded to blob storage to obtain a URL for it.

    Args:
        fileName: Path of the local file to analyze.

    Returns:
        A tuple ``(operationUrl, fileUrl)``: the value of the response's
        ``Operation-Location`` header, and the URL of the uploaded blob.

    Raises:
        requests.HTTPError: If the analyze request is answered with an error
            status.
        KeyError: If the response carries no ``Operation-Location`` header.
    """
    fileUrl = uploadToAzureBlob(fileName)
    try:
        res = requests.post(
            url=f"{ENDPOINT}/formrecognizer/documentModels/prebuilt-document:analyze?api-version=2022-08-31",
            headers={"Content-Type": "application/json", "Ocp-Apim-Subscription-Key": KEY},
            json={"urlSource": fileUrl},
        )
        # Surface a rejected request as an HTTP error instead of a KeyError
        # on the missing header below.
        res.raise_for_status()
        operationUrl = res.headers["Operation-Location"]
    except Exception:
        # On failure the caller never receives fileUrl, so the uploaded blob
        # has to be removed here or it would be left behind.
        deleteFromAzureBlob(fileUrl)
        raise
    return (operationUrl, fileUrl)
# Analyze a given document
def getAnalyzeResult(filename, pollInterval=1):
    """Analyze a document and return the extracted data.

    Starts the analysis, then polls the operation URL until the service
    reports a final status. The uploaded blob is deleted when polling ends,
    whether it ended with a result or with an error.

    Args:
        filename: Path of the local file to analyze.
        pollInterval: Seconds to wait between two polls (default 1).

    Returns:
        A dict with the keys ``"tables"``, ``"keyValuePairs"`` and
        ``"paragraphs"``, taken from the response's ``analyzeResult``.

    Raises:
        RuntimeError: If the service reports the status ``"failed"``.
        requests.HTTPError: If a poll is answered with an error status.
        KeyError: If a response lacks ``status`` or an expected result key.
    """
    # Send the file to Azure to start analyzing
    analyze_url, fileUrl = analyze(filename)
    try:
        # Analysis takes a while, so keep re-requesting until it is finished
        while True:
            res = requests.get(
                url=analyze_url,
                headers={
                    "Content-Type": "application/json",
                    "Ocp-Apim-Subscription-Key": KEY,
                },
            )
            res.raise_for_status()
            result = res.json()
            status = result["status"]
            if status == "succeeded":
                analyzeResult = result["analyzeResult"]
                return {
                    "tables": analyzeResult["tables"],
                    "keyValuePairs": analyzeResult["keyValuePairs"],
                    "paragraphs": analyzeResult["paragraphs"],
                }
            if status == "failed":
                # A failed operation never turns into "succeeded"; polling on
                # would loop forever.
                raise RuntimeError(
                    f"Document analysis failed: {result.get('error')}"
                )
            # Not finished yet: wait and try again
            time.sleep(pollInterval)
    finally:
        # Remove the temporary blob on success and on every error path
        deleteFromAzureBlob(fileUrl)
# Convert the data from the provided form into CSV and object form
def formToData(formName):
    """Analyze the form at *formName* and return its processed data.

    Args:
        formName: Path of the form document, passed to ``getAnalyzeResult``.

    Returns:
        A dict with ``"csv"`` (result of ``processDataToCSV``), ``"obj"``
        (result of ``processDataToObj``) and ``"status"``, which is always
        ``"succeeded"``.
    """
    analysis = getAnalyzeResult(formName)  # JSON data from the form analyzer
    return {
        "csv": processDataToCSV(analysis),
        "obj": processDataToObj(analysis),
        "status": "succeeded",
    }
# Run if the file is executed directly
if __name__ == "__main__":
    # Both options are required: argparse then prints a usage line carrying
    # the real script name and exits with a non-zero status when one is
    # missing, instead of printing a hint and exiting successfully.
    parser = argparse.ArgumentParser(
        description="Extract the data of a form document into a CSV file."
    )
    parser.add_argument(
        "-i", required=True, metavar="INPUT", help="path of the document to analyze"
    )
    parser.add_argument(
        "-o", required=True, metavar="OUTPUT", help="path of the CSV file to write"
    )
    args = parser.parse_args()
    if not (args.i and args.o):
        # Empty strings were treated as missing arguments before; keep that.
        parser.error("-i and -o must not be empty")

    data = formToData(args.i)
    # Explicit encoding so extracted non-ASCII text does not depend on the
    # platform's default encoding.
    with open(args.o, "w", newline="", encoding="utf-8") as f:
        f.write(data["csv"])
    print("Done!")