Merge pull request #28 from 8baby8/main

Modifying Ocr Scripts
SmartFlowAI · Feb 29, 2024 · 5193ad6 · 5193ad6
2 parents 8dfe1fe + 681f94d
commit 5193ad6
Showing 1 changed file with 66 additions and 68 deletions.
diff --git a/scripts/ptf2txt.py b/scripts/pdf2txt.py
@@ -1,69 +1,67 @@
-import os 
-import sys 
-import glob 
-try :
- import cv2 
-except:
- os.system('pip install opencv-python')
-try :
- from paddleocr import PaddleOCR , draw_ocr , download_with_progressbar 
-except:
- os.system('pip install paddleocr')
-output_folder_path = 'res/'
-if not os.path.exists(output_folder_path): 
- os.makedirs(output_folder_path) 
-
-def get_pdf_files_in_directory(directory_path): 
- # 确保路径存在 
- if os.path.exists(directory_path) and os.path.isdir(directory_path): 
- # 使用glob模块搜索所有PDF文件 
- pdf_files = glob.glob(os.path.join(directory_path, '**', '*.pdf'), recursive=True) 
- return pdf_files 
- else: 
- return [] 
-def ocr_pdf_folder(folder_path):
- ocr = PaddleOCR ( use_angle_cls = True , lang = "ch" , page_num = 0 ) # 只需运行一次即可将模型下载并加载到内存中
- print("ppocrv4 加载完毕！！！")
- pdf_paths = get_pdf_files_in_directory(folder_path)
- print(f"共检测到 {len(pdf_paths)} 个PDF文件")
- # 打印所有PDF文件的路径 
- for pdf_path in pdf_paths: 
- print(f'正在处理文件：{pdf_path}')
-
- result = ocr . ocr ( pdf_path , cls = True ) 
- for idx in range ( len ( result )): 
- res = result [ idx ] 
- for line in res : 
- print ( line ) 
- print(f'{pdf_path} 处理完毕')
- ocr_result = ""
- for idx in range(len(result)):
- res = result[idx]
- for line in res:
- # print(line[1][0])
- ocr_result = ocr_result + " " + str(line[1][0])
-
- filename = os.path.splitext(os.path.basename(pdf_path))[0] 
-
- # 构建TXT文件的完整路径 
- txt_path = os.path.join('res/', filename + '.txt') 
-
- # 将提取的文本写入TXT文件 
- with open(txt_path, 'w', encoding='utf-8') as txt_file: 
- txt_file.write(ocr_result) 
-
- print(f'生成的txt文档保存在{txt_path}')
- # break
- # print(ocr_result)
- # with open('my_file.txt', 'a') as f: 
- # # 写入字符串 
- # f.write(ocr_result)
-
-
-if __name__ == "__main__": 
- if len(sys.argv) > 1: 
- # sys.argv[0] 是脚本名，sys.argv[1:] 是传递给脚本的参数列表 
- pdf_path = sys.argv[1] 
- print(f'需要处理的文件夹是：{pdf_path}')
- ocr_pdf_folder(pdf_path)
+import os
+import sys
+import glob
+# Best-effort bootstrap: install missing OCR dependencies on first run.
+# Only ImportError triggers an install; any other failure is a real error
+# and is allowed to propagate instead of being hidden by a bare except.
+try:
+    import cv2  # noqa: F401
+except ImportError:
+    # cv2 is not referenced by name in this script (presumably it is needed
+    # by paddleocr - TODO confirm), so installing it is enough here.
+    # sys.executable makes pip target the interpreter running this script.
+    os.system(f'"{sys.executable}" -m pip install opencv-python')
+try:
+    from paddleocr import PaddleOCR, draw_ocr, download_with_progressbar
+except ImportError:
+    os.system(f'"{sys.executable}" -m pip install paddleocr')
+    # Import again so PaddleOCR is actually bound after the install; without
+    # this the script failed later with a NameError on first use.
+    from paddleocr import PaddleOCR, draw_ocr, download_with_progressbar
+
+# Directory that receives one .txt file per processed PDF.
+output_folder_path = 'res/'
+# exist_ok avoids the check-then-create race of exists() + makedirs().
+os.makedirs(output_folder_path, exist_ok=True)
+
+def get_pdf_files_in_directory(directory_path):
+    """Collect every ``*.pdf`` path found under ``directory_path``.
+
+    The directory is searched recursively. An empty list is returned when
+    ``directory_path`` does not exist or is not a directory.
+    """
+    # isdir() is already False for a missing path, so one check covers both.
+    if not os.path.isdir(directory_path):
+        return []
+    pattern = os.path.join(directory_path, '**', '*.pdf')
+    return glob.glob(pattern, recursive=True)
+def ocr_pdf_folder(folder_path):
+    """Run OCR on every PDF under ``folder_path`` and save the text as .txt.
+
+    One ``<pdf basename>.txt`` file is written to ``output_folder_path`` for
+    each PDF returned by ``get_pdf_files_in_directory``. Every recognised
+    text fragment is written preceded by a single space, so non-empty
+    output starts with a space.
+
+    NOTE(review): PDFs that share a basename in different sub-folders map
+    to the same .txt file, so the one processed later overwrites the other.
+    """
+    # The engine is built once for the whole folder; the original comment
+    # notes that this downloads the models and loads them into memory.
+    # NOTE(review): page_num=0 presumably means "all pages" - confirm.
+    ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=0)
+    print("ppocrv4 加载完毕！！！")
+    pdf_paths = get_pdf_files_in_directory(folder_path)
+    print(f"共检测到 {len(pdf_paths)} 个PDF文件")
+    for pdf_path in pdf_paths:
+        print(f'正在处理文件：{pdf_path}')
+
+        # The result is indexed per page. A page entry (or the whole
+        # result) can be None when no text was detected - TODO confirm for
+        # the installed PaddleOCR version - so empty entries are skipped
+        # instead of raising TypeError while iterating.
+        pages = ocr.ocr(pdf_path, cls=True) or []
+        texts = []
+        for page in pages:
+            if not page:
+                continue
+            for line in page:
+                print(line)
+                # line[1][0] is the field the script stores as the text.
+                texts.append(str(line[1][0]))
+        print(f'{pdf_path} 处理完毕')
+
+        # Single join instead of repeated string concatenation; the
+        # leading space before every fragment matches the old output.
+        ocr_result = "".join(f" {text}" for text in texts)
+
+        filename = os.path.splitext(os.path.basename(pdf_path))[0]
+
+        # Build the output path from the shared module-level folder rather
+        # than repeating the 'res/' literal.
+        txt_path = os.path.join(output_folder_path, f'{filename}.txt')
+
+        with open(txt_path, 'w', encoding='utf-8') as txt_file:
+            txt_file.write(ocr_result)
+
+        print(f'生成的txt文档保存在{txt_path}')
+
+
+if __name__ == "__main__":
+    # sys.argv[0] is the script name; sys.argv[1] is the folder to scan.
+    if len(sys.argv) > 1:
+        folder_path = sys.argv[1]
+        print(f'需要处理的文件夹是：{folder_path}')
+        ocr_pdf_folder(folder_path)
+    else:
+        # A missing argument used to make the script exit silently with
+        # status 0; report the usage on stderr and exit non-zero instead.
+        sys.exit(f'用法：python {sys.argv[0]} <PDF文件夹路径>')