-
Notifications
You must be signed in to change notification settings - Fork 3
/
index.py
33 lines (30 loc) · 1.23 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import streamlit as st
from PIL import Image
import pytesseract
import pdf2image
from pdf2image import convert_from_path, convert_from_bytes
from pdf2image.exceptions import (
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError,
)
from process.easy_ocr import process as easy_ocr_process
from process.tesseract_ocr import process as tesseract_ocr_process
st.title("Parse Vision")
st.write("Visualise what OCR libraries see in your documents")
uploaded_file = st.file_uploader("Upload a file", type=["pdf"])
if uploaded_file is not None:
images = convert_from_bytes(uploaded_file.getvalue())
with st.spinner("Processing... this may take several seconds"):
processed_images = easy_ocr_process(images)
processed_images2 = tesseract_ocr_process(images)
# create 2 tabs for each OCR library
# tab 1 = easy_ocr
# tab 2 = pytesseract
tab1, tab2 = st.tabs(["EasyOCR", "Tesseract"])
with tab1:
for i, image in enumerate(processed_images):
st.image(image, caption=f"Page {i}", use_column_width=True)
with tab2:
for i, image in enumerate(processed_images2):
st.image(image, caption=f"Page {i}", use_column_width=True)