-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfile_reader.py
56 lines (41 loc) · 1.39 KB
/
file_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import docx
import fitz
import pdfplumber
from streamlit.uploaded_file_manager import UploadedFile
import text_preprocessing
class ResumeReader(object):
    """Extract plain text from resume files in PDF or DOCX format.

    The file type is chosen from the file name's extension. When
    ``need_clean`` is true, the extracted text is additionally passed
    through ``text_preprocessing.clean_text`` before being returned.
    """

    def __init__(self, need_clean=True):
        """Create a reader.

        Args:
            need_clean: When true, ``read_text_from_file`` runs the
                extracted text through ``text_preprocessing.clean_text``.
        """
        self.need_clean = need_clean

    @staticmethod
    def _join_page_texts(page_texts):
        """Concatenate per-page texts, appending one space after each page.

        Args:
            page_texts: Iterable of page texts. An entry may be ``None``
                (an extractor may report a page without text that way);
                such entries are treated as empty strings instead of
                raising ``TypeError``.

        Returns:
            The concatenated text; every page contributes its text
            followed by a single space.
        """
        return "".join((text or "") + " " for text in page_texts)

    def extract_text_from_pdf_file(self, file):
        """Extract the text of every page of a PDF using pdfplumber.

        Args:
            file: Whatever ``pdfplumber.open`` accepts (this class passes
                the uploaded file object straight through).

        Returns:
            The text of all pages, each followed by a single space.
        """
        with pdfplumber.open(file) as pdf:
            # The generator is consumed while the PDF is still open.
            return self._join_page_texts(
                page.extract_text() for page in pdf.pages
            )

    def extract_text_from_pdf(self, filename):
        """Extract the text of every page of a PDF using PyMuPDF (fitz).

        Args:
            filename: Value handed to ``fitz.open``.

        Returns:
            The text of all pages, each followed by a single space.
        """
        # Use the document as a context manager so it is closed even if
        # text extraction fails part-way through.
        with fitz.open(filename) as pdf:
            return self._join_page_texts(page.get_text() for page in pdf)

    def extract_text_from_doc(self, file):
        """Extract the paragraph text of a DOCX document.

        Args:
            file: Value handed to ``docx.Document``.

        Returns:
            The text of all paragraphs, each preceded by a single space.
        """
        document = docx.Document(file)
        return "".join(" " + paragraph.text for paragraph in document.paragraphs)

    def read_text_from_file(self, resume_file: "UploadedFile"):
        """Read the text of an uploaded resume, dispatching on its extension.

        Args:
            resume_file: Uploaded file object; its ``name`` attribute is
                used to determine the file type.

        Returns:
            The extracted text (cleaned when ``self.need_clean`` is true).
            Files whose extension is neither ``pdf`` nor ``docx`` yield
            the (optionally cleaned) empty string.
        """
        # Compare case-insensitively so "CV.PDF" / "resume.DOCX" are
        # recognised as well.
        extension = resume_file.name.rsplit('.', 1)[-1].lower()

        text = ''
        if extension == 'pdf':
            text = self.extract_text_from_pdf_file(resume_file)
        elif extension == 'docx':
            text = self.extract_text_from_doc(resume_file)

        if self.need_clean:
            text = text_preprocessing.clean_text(text)
        return text