forked from manikanthp/LayoutLMV3_Fine_Tuning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: Create_LMv3_dataset_with_paddleOCR.py
125 lines (88 loc) · 4.24 KB
/
Create_LMv3_dataset_with_paddleOCR.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import os
from paddleocr import PaddleOCR
from PIL import Image, ImageDraw, ImageFont
import json
from uuid import uuid4
import numpy as np
# Build the PaddleOCR engine once at module load time (the first run downloads
# the detection/recognition models into the local cache).
# NOTE(review): `rec=False` is passed to the constructor — presumably intended to
# disable the recognition stage, yet the loop below still reads recognized text
# from `item[1][0]`; confirm the installed PaddleOCR version honors this kwarg.
ocr = PaddleOCR(use_angle_cls=False,
lang='en',
rec=False,
) # need to run only once to download and load model into memory
# Folder of .png page images to convert into Label Studio pre-annotation tasks.
images_folder_path = "D:/Projects/AI_Projects/NLP/Document_AI/LayoutLM_Models/image"
def create_image_url(filename):
    """Map a local image filename to the URL Label Studio will fetch it from.

    Label Studio requires image URLs, so this defines the mapping from
    filesystem to URLs. If you use ./serve_local_files.sh <my-images-dir>,
    the image URLs are localhost:8081/filename.png; otherwise you can build
    links like /data/upload/filename.png to refer to the files.

    Args:
        filename: Base name of the image file (e.g. "page1.png").

    Returns:
        The HTTP URL string for that image.
    """
    # Bug fix: the f-string previously had no placeholder, so every task
    # pointed at the same broken URL instead of interpolating the filename.
    return f'http://localhost:8080/{filename}'
def convert_bounding_box(bounding_box):
    """Convert a corner-style box [x1, y1, x2, y2] into [x, y, width, height].

    Args:
        bounding_box: A list of four numbers, the x1, y1, x2, y2 coordinates
            of two opposite corners (in any order).

    Returns:
        A list [x, y, width, height] where (x, y) is the top-left corner and
        width/height are non-negative extents.

    Notes:
        Fixes two inconsistencies in the original: the docstring claimed the
        return order was [x, y, height, width] while the code returned
        [x, y, width, height] (the code's order is kept), and the extents
        were signed differences (negative when the corners were given in
        reversed order) even though the origin used min(); extents are now
        derived from the same min/max corners.
    """
    x1, y1, x2, y2 = bounding_box
    x = min(x1, x2)
    y = min(y1, y2)
    # Consistent with the min-based origin: always a non-negative extent.
    width = max(x1, x2) - x
    height = max(y1, y2) - y
    return [x, y, width, height]
def create_image_url(filename):
    """Map a local image filename to the URL Label Studio will fetch it from.

    Label Studio requires image URLs, so this defines the mapping from
    filesystem to URLs. If you use ./serve_local_files.sh <my-images-dir>,
    the image URLs are localhost:8081/filename.png; otherwise you can build
    links like /data/upload/filename.png to refer to the files.

    NOTE(review): this is a duplicate of the definition above; being later in
    the module, this one is the definition actually used at runtime.

    Args:
        filename: Base name of the image file (e.g. "page1.png").

    Returns:
        The HTTP URL string for that image.
    """
    # Bug fix: the f-string previously had no placeholder, so every task
    # pointed at the same broken URL instead of interpolating the filename.
    return f'http://localhost:8080/{filename}'
def extracted_tables_to_label_studio_json_file_with_paddleOCR(images_folder_path):
    """Run PaddleOCR on every .png in a folder and write Label Studio tasks.

    For each PNG image, runs the module-level `ocr` engine, converts each
    detected text line's quadrilateral into a percentage-based rectangle
    (Label Studio's coordinate convention), and pairs it with a textarea
    transcription region sharing the same region id. All tasks are written to
    'TC_label-studio_input_file.json' for import into Label Studio.

    Args:
        images_folder_path: Folder containing the .png images to process.
            (Bug fix: this parameter is now actually used to open the images;
            the original re-hardcoded an absolute path.)

    Returns:
        None. Side effect: writes 'TC_label-studio_input_file.json' in the
        current working directory.
    """
    label_studio_task_list = []
    for images in os.listdir(images_folder_path):
        if images.endswith('.png'):
            output_json = {}
            annotation_result = []
            print(images)
            output_json['data'] = {"ocr": create_image_url(images)}
            # Use the folder argument instead of a duplicated hard-coded path.
            img = Image.open(os.path.join(images_folder_path, images))
            img = np.asarray(img)
            image_height, image_width = img.shape[:2]
            result = ocr.ocr(img, cls=False)
            for output in result:
                for item in output:
                    co_ord = item[0]   # quadrilateral: four [x, y] corner points
                    text = item[1][0]  # recognized text string
                    # Skip empty detections before doing any bbox arithmetic.
                    if not text:
                        continue
                    # [x, y, w, h] from the top-left and bottom-right corners.
                    # NOTE(review): y comes from the top-right corner's y
                    # (co_ord[1][1]); fine for axis-aligned boxes, slightly off
                    # for rotated ones — confirm if rotated text matters.
                    four_co_ord = [co_ord[0][0],
                                   co_ord[1][1],
                                   co_ord[2][0] - co_ord[0][0],
                                   co_ord[2][1] - co_ord[1][1]]
                    # Label Studio expects percentages of the image dimensions.
                    bbox = {
                        'x': 100 * four_co_ord[0] / image_width,
                        'y': 100 * four_co_ord[1] / image_height,
                        'width': 100 * four_co_ord[2] / image_width,
                        'height': 100 * four_co_ord[3] / image_height,
                        'rotation': 0
                    }
                    # Bbox and transcription share one id so Label Studio links them.
                    region_id = str(uuid4())[:10]
                    score = 0.5
                    bbox_result = {
                        'id': region_id, 'from_name': 'bbox', 'to_name': 'image', 'type': 'rectangle',
                        'value': bbox}
                    transcription_result = {
                        'id': region_id, 'from_name': 'transcription', 'to_name': 'image', 'type': 'textarea',
                        'value': dict(text=[text], **bbox), 'score': score}
                    annotation_result.extend([bbox_result, transcription_result])
            output_json['predictions'] = [{"result": annotation_result, "score": 0.97}]
            label_studio_task_list.append(output_json)
    # Saving label_studio_task_list as a JSON file to import into Label Studio.
    with open('TC_label-studio_input_file.json', 'w') as f:
        json.dump(label_studio_task_list, f, indent=4)


if __name__ == "__main__":
    # Guard the script entry so importing this module doesn't trigger OCR.
    extracted_tables_to_label_studio_json_file_with_paddleOCR(images_folder_path)