-
Notifications
You must be signed in to change notification settings - Fork 0
/
functions.py
192 lines (176 loc) · 17.4 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# from transformers import TrOCRProcessor
# from optimum.onnxruntime import ORTModelForVision2Seq
import os
import re
import zipfile
import latex2mathml.converter
import warnings
warnings.filterwarnings('ignore')
from PIL import Image
import requests
from io import BytesIO
from PIL import Image
from pix2text import Pix2Text, merge_line_texts
# Shared Pix2Text recogniser, created once when this module is imported.
# No configuration is passed, so it presumably uses the library defaults
# (TODO confirm). latexGeneration and latexGenerationfunczip call
# p2t.recognize() on this instance.
p2t = Pix2Text.from_config()
# processor = TrOCRProcessor.from_pretrained('breezedeus/pix2text-mfr')
# model = ORTModelForVision2Seq.from_pretrained('breezedeus/pix2text-mfr', use_cache=False)
# Operator characters. In the part of the file visible here this set is
# referenced only from commented-out code.
prohibited_symbols = {'+', '-', '*', '/','±','=','>','<'}
def download_img(url):
    """Open an image and return it converted to RGB.

    Despite the name, nothing is downloaded: ``url`` is handed straight to
    ``Image.open``. Callers in this module pass either whatever they were
    given as ``data`` or an open zip-archive member.

    Args:
        url: Image source accepted by ``Image.open``.

    Returns:
        A new image in ``'RGB'`` mode.
    """
    # convert() produces an independent, fully loaded image, so the source
    # image can be released as soon as the conversion is done instead of
    # lingering until garbage collection.
    with Image.open(url) as source:
        return source.convert('RGB')
def extractImage(imageList, prs, ppt_filename):
    """Write every picture found in a presentation to the working directory.

    Each output file is named
    ``<prefix>_<slide>_<shape>_<image filename>``, where ``<prefix>`` is the
    part of ``ppt_filename`` before its first ".", and ``<slide>`` /
    ``<shape>`` are 1-based positions. The shape number is the shape's
    position among *all* shapes on the slide, not a count of pictures.

    Args:
        imageList: List that receives each written file name (appended in
            place, in the order the files are written).
        prs: Presentation object exposing ``slides``; every slide exposes
            ``shapes``.
        ppt_filename: Name of the source presentation, used for the prefix.

    Returns:
        None. Results are reported through ``imageList`` and the files.
    """
    prefix = ppt_filename.split('.')[0]
    for pageNum, slide in enumerate(prs.slides, start=1):
        for imgNum, shape in enumerate(slide.shapes, start=1):
            # Look the picture up exactly once. Shapes without an ``image``
            # attribute raise AttributeError; a picture shape that has no
            # embedded image can raise ValueError from the property itself.
            # Neither should abort the extraction of the remaining shapes.
            try:
                image = shape.image
            except (AttributeError, ValueError):
                continue
            if not image:
                continue
            image_data = image.blob
            temp_img_name = f"{prefix}_{pageNum}_{imgNum}_{image.filename}"
            imageList.append(temp_img_name)
            with open(temp_img_name, "wb") as f:
                f.write(image_data)
def processImage(data,fileName):
    """Run ``latexGeneration`` on one image without letting errors escape.

    Args:
        data: Image source, forwarded unchanged.
        fileName: Image name, forwarded unchanged.

    Returns:
        Whatever ``latexGeneration`` returns. If it raises, the exception's
        ``args`` are printed and ``None`` is returned.
    """
    try:
        result_path = latexGeneration(data, fileName)
    except Exception as err:
        # Best-effort wrapper: report the problem and carry on.
        print(err.args)
        return None
    else:
        return result_path
def processImagezip(image , fileName):
    """Run ``latexGenerationfunczip`` on one image without letting errors escape.

    Args:
        image: Image source, forwarded unchanged.
        fileName: Image name, forwarded unchanged.

    Returns:
        Whatever ``latexGenerationfunczip`` returns. If it raises, the
        exception's ``args`` are printed and ``None`` is returned.
    """
    try:
        result_path = latexGenerationfunczip(image, fileName)
    except Exception as err:
        # Best-effort wrapper: report the problem and carry on.
        print(err.args)
        return None
    else:
        return result_path
def readImages(file):
    """Run ``processImagezip`` on every JPG/PNG member of a zip archive.

    Members whose name starts with ``__MACOSX/`` are skipped. The extension
    check is case-insensitive. The ``output_text`` directory is created in
    the working directory if it does not exist yet.

    Args:
        file: Archive to read; passed directly to ``zipfile.ZipFile``.

    Returns:
        ``True`` once the whole archive has been walked, ``False`` if opening
        or reading it raised (the exception's ``args`` are printed). Failures
        of individual images are handled inside ``processImagezip`` and do
        not change the result.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("output_text", exist_ok=True)
    try:
        with zipfile.ZipFile(file, "r") as zip_ref:
            for filename in zip_ref.namelist():
                if filename.startswith("__MACOSX/"):
                    continue
                if not filename.lower().endswith((".jpg", ".png")):
                    continue
                with zip_ref.open(filename, "r") as image_data:
                    processImagezip(image_data, filename)
    except Exception as e:
        print(e.args)
        # Report failure explicitly instead of falling through to None.
        return False
    return True
def latexGeneration(data,fileName):
    """Recognise the display formulas in an image and save them as MathML.

    The image is read with ``download_img`` and recognised with the
    module-level ``p2t`` object. Every ``$$ ... $$`` span in the recognised
    text is converted with ``latex2mathml`` and written, one per line, to
    ``output_text/<stem>.txt``, where ``<stem>`` is the part of ``fileName``
    before its first ".".

    Args:
        data: Image source handed to ``download_img``.
        fileName: Image name used to derive the output file name.

    Returns:
        The path of the written text file, or ``None`` if any step after the
        directory creation failed (the exception's ``args`` are printed).
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("output_text", exist_ok=True)
    try:
        imgName = fileName.split('.')[0]
        targetTxt = f"output_text/{imgName}.txt"
        image = download_img(data)
        recognised = p2t.recognize(image, file_type='text_formula', return_text=True, save_analysis_res='en1-out.jpg')  # recognize mixed images
        formulas = [found.strip() for found in re.findall(r'\$\$(.*?)\$\$', recognised, re.DOTALL)]
        # Convert everything before touching the output file, so a formula
        # the converter rejects cannot leave a truncated file behind.
        mathml_lines = [latex2mathml.converter.convert(formula) + '\n' for formula in formulas]
        # fileName may carry a directory part; make sure it exists. The stem
        # stops at the first ".", so it can never contain a ".." component.
        os.makedirs(os.path.dirname(targetTxt), exist_ok=True)
        with open(targetTxt, 'w', encoding='utf-8') as f:
            f.writelines(mathml_lines)
        return targetTxt
    except Exception as e:
        print(e.args)
        return None
# apt-get update
# apt-get install vim
def latexGenerationfunczip(data,fileName):
    """Recognise the display formulas in a zipped image and save them as MathML.

    The image is read with ``download_img`` and recognised with the
    module-level ``p2t`` object. Every ``$$ ... $$`` span in the recognised
    text is converted with ``latex2mathml`` and written, one per line, to
    ``output_text1/<stem>.txt``, where ``<stem>`` is the part of
    ``fileName`` before its first ".". Folder components in ``fileName``
    (archive members such as ``photos/a.png``) are kept and the matching
    sub-directory is created.

    Args:
        data: Image source handed to ``download_img``.
        fileName: Archive member name used to derive the output file name.

    Returns:
        The path of the written text file, or ``None`` if any step after the
        directory creation failed (the exception's ``args`` are printed).
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("output_text1", exist_ok=True)
    try:
        imgName = fileName.split('.')[0]
        targetTxt = f"output_text1/{imgName}.txt"
        image = download_img(data)
        recognised = p2t.recognize(image, file_type='text_formula', return_text=True, save_analysis_res='en1-out.jpg')  # recognize mixed images
        formulas = [found.strip() for found in re.findall(r'\$\$(.*?)\$\$', recognised, re.DOTALL)]
        # Convert everything before touching the output file, so a formula
        # the converter rejects cannot leave a truncated file behind.
        mathml_lines = [latex2mathml.converter.convert(formula) + '\n' for formula in formulas]
        # Members stored inside a folder need that folder under the output
        # directory. The stem stops at the first ".", so it can never
        # contain a ".." component that would escape output_text1.
        os.makedirs(os.path.dirname(targetTxt), exist_ok=True)
        with open(targetTxt, 'w', encoding='utf-8') as f:
            f.writelines(mathml_lines)
        return targetTxt
    except Exception as e:
        print(e.args)
        return None
# def textGenerationfunc(data,fileName):
# if not os.path.exists("output_text"):
# os.mkdir('output_text')
# try:
# imgName = fileName.split('.')[0]
# images=[download_img(data)]
# pixel_values = processor(images=images, return_tensors="pt").pixel_values
# # print(f'pixel_values', pixel_values)
# generated_ids = model.generate(pixel_values)
# generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
# targetTxt = f"output_text/{imgName}.txt"
# for i in range(0,len(generated_text)):
# generated_texts = generated_text[i].split('\\\\')
# with open(targetTxt, 'w') as f:
# # Iterate through each element in the list
# for text in generated_texts:
# if any(symbol in text for symbol in prohibited_symbols):
# text=text.replace('$$','').replace('frac',"N").replace('sin','999999999').replace('cos','999999998').replace('tan','9999999997').replace('cosec','9999999996').replace('sec','9999999995').replace('cot','9999999994').replace(' ','').replace('cdot',' ').replace('\prime','2').replace('begin{aligned}','').replace('\circ','2').replace('{{}','').replace("{}",'').replace('\\','').replace('&','').replace(',,,','').replace('pm','±').replace('leqslant','≤').replace('leq','≤')
# text=text.strip()
# pattern = r"[a-zA-Z]{4,}"
# # Find all matches in the text
# matches = re.findall(pattern, text)
# dictofwords ={}
# for idx,i in enumerate(matches):
# constant=9999999999999999999
# text =text.replace(f'{i}',f" {constant-idx}")
# dictofwords[i]=f" {constant-idx}"
# mathml_output = latex2mathml.converter.convert(text)
# for item in dictofwords.items():
# mathml_output=mathml_output.replace(f"<mn>{item[1].strip()}</mn>",f"<mi>{item[0]}</mi>")
# # mathml_output=mathml_output.replace('|','|').replace('!','').replace('(','(').replace('999999999','sin').replace('999999998','cos').replace('9999999997','tan').replace('9999999996','cosec').replace('9999999995','sec').replace('9999999994','cot').replace('+','+').replace(')',')').replace('/','/').replace('*','*').replace('<mi>','').replace(',','').replace('</mi>','').replace('stretchy="false"','').replace(' ','').replace('>','>').replace('<','<').replace('=','=').replace('−','-').replace(' ','~').replace('displaystyle','')
# mathml_output=mathml_output.replace('999999999','sin').replace("N",'frac').replace('999999998','cos').replace('9999999997','tan').replace('9999999996','cosec').replace('9999999995','sec').replace('9999999994','cot').replace('stretchy="false"','').replace(' ','').replace('displaystyle','').replace('<mi>','').replace('</mi>','')
# f.write(mathml_output + '\n')
# f.close()
# return targetTxt
# except Exception as e:
# print(e.args)
# def textGenerationfunczip(data,fileName):
# if not os.path.exists("output_text1"):
# os.mkdir('output_text1')
# try:
# imgName = fileName.split('.')[0]
# images=[download_img(data)]
# pixel_values = processor(images=images, return_tensors="pt").pixel_values
# generated_ids = model.generate(pixel_values)
# generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
# targetTxt = f"output_text1/{imgName}.txt"
# for i in range(0,len(generated_text)):
# generated_texts = generated_text[i].split('\\\\')
# with open(targetTxt, 'w') as f:
# for text in generated_texts:
# if any(symbol in text for symbol in prohibited_symbols):
# text=text.replace('$$','').replace('sin','999999999').replace('cos','999999998').replace('tan','9999999997').replace('cosec','9999999996').replace('sec','9999999995').replace('cot','9999999994').replace(' ','').replace('cdot',' ').replace('\prime','2').replace('begin{aligned}','').replace('\circ','2').replace('{{}','').replace("{}",'').replace('\\','').replace('&','').replace(',,,','').replace('pm','±').replace('leqslant','≤').replace('leq','≤')
# text=text.strip()
# mathml_output = latex2mathml.converter.convert(text)
# # mathml_output=mathml_output.replace('|','|').replace('!','').replace('(','(').replace('999999999','sin').replace('999999998','cos').replace('9999999997','tan').replace('9999999996','cosec').replace('9999999995','sec').replace('9999999994','cot').replace('+','+').replace(')',')').replace('/','/').replace('*','*').replace('<mi>','').replace(',','').replace('</mi>','').replace('stretchy="false"','').replace(' ','').replace('>','>').replace('<','<').replace('=','=').replace('−','-').replace(' ','~').replace('displaystyle','')
# mathml_output=mathml_output.replace('999999999','sin').replace('999999998','cos').replace('9999999997','tan').replace('9999999996','cosec').replace('9999999995','sec').replace('9999999994','cot').replace('stretchy="false"','').replace(' ','').replace('displaystyle','')
# f.write(mathml_output + '\n')
# f.close()
# return targetTxt
# except Exception as e:
# print(e.args)