-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
566 lines (426 loc) · 24.7 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
from utils import *
from extractors import *
from thefuzz import fuzz, process
import argparse as ag
import sys
import pkg_resources
def _order_highlight_matches(highlights, page_text, threshold):
    """Return the page-text entries that fuzzily match *highlights*, in page order.

    Each highlight is scored against *page_text* with ``process.extract`` using
    ``fuzz.token_sort_ratio``. Candidates scoring below *threshold* are dropped,
    duplicates are removed, and the survivors are sorted by their position in
    *page_text* so the summary reads in the same order as the page.
    """
    scored = []
    for highlight in highlights:
        scored.extend(process.extract(highlight, page_text, scorer=fuzz.token_sort_ratio))

    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique = list(dict.fromkeys(candidate for candidate, score in scored if score >= threshold))
    return sorted(unique, key=page_text.index)


def _insert_headings(matches, headings):
    """Return a copy of *matches* with heading markers placed before the text they head.

    For every match that contains a heading, the heading text is removed from
    the match and a marker line for that heading is emitted immediately before
    the remaining text. A new list is built instead of inserting into *matches*
    while iterating over it: the in-place approach overwrote the first marker
    and duplicated the paragraph whenever one match contained two headings.
    """
    structured = []
    for match in matches:
        remaining = match
        for heading in headings:
            # An empty heading is "contained" in every string and would emit a
            # marker for every match, so it is skipped.
            if not heading or heading not in remaining:
                continue
            remaining = remaining.replace(heading, "")
            structured.append(f"\n\n\n####### {heading} ##############\n\n\n")
        structured.append(remaining)
    return structured


def get_summary(doc, print_results=False, include_images=False, images_folder="tmp-images", show_progress=True, threshold=50, show_image_process=False):
    """
    Build a page-by-page summary of a PDF from its highlighted text.

    Arguments:
    + doc -- The opened PDF document; must expose `page_count` and be accepted by the project helpers used below.
    + print_results (bool) -- Print intermediate results through `displayResult` (default: False).
    + include_images (bool) -- Run the interactive image / caption selection for every page (default: False).
    + images_folder (str) -- Folder handed to `getImages` and `load_images_from_folder` (default: "tmp-images").
    + show_progress (bool) -- Print an "[INFO]" line before and after each page (default: True).
    + threshold (int) -- Minimum fuzzy-match score for a page-text entry to count as highlighted (default: 50).
    + show_image_process (bool) -- Forwarded to the image helpers to show their intermediate output (default: False).

    Per page, in order:
    1. Render the page to an image and read its text (`pdf_page_to_image`, `getTextFromPDFAsParagraphs`).
    2. Extract highlighted text (`getHighlightedText`) and fuzzy-match it against the page text
       with `thefuzz`, keeping unique matches at or above `threshold` in page order.
    3. If `include_images` is set: show the page with `cv2` until ESC, `q` or `x` is pressed, then
       extract images and let the user pick an image and a caption.
    4. Extract headings (`getHeadings`) and place a marker line before each match that contains one.
    5. Append the page header, the matches, the optional image/caption selections, a statistics
       line and a separator to the result list.

    Returns:
    + results -- A flat list for the whole document: strings, plus whatever `display_images_grid`
      and `display_strings` return when `include_images` is set.

    Notes:
    - All extraction, counting and display helpers come from the project's star imports; their exact
      return shapes are not visible here.
    - `cv2.destroyAllWindows()` is always called before returning.
    """
    results = []

    for page_num in range(1, doc.page_count + 1):
        if show_progress:
            print(f"\n[INFO] Processing page {page_num} of {doc.page_count}...")

        # Image version of the page, used for highlight detection and display.
        img = pdf_page_to_image(doc, page_num)

        # Main text of the page, used as the reference for fuzzy matching.
        actual_page_text, actual_page_text_for_headings = getTextFromPDFAsParagraphs(doc, page_num)
        original_count = get_count(actual_page_text, 0)
        if print_results:
            displayResult("ORIGINAL TEXT", actual_page_text)

        # Highlighted text found on the page.
        hText = getHighlightedText(doc, page_num, img, show_result_image=show_image_process, show_process=show_image_process)
        if print_results:
            displayResult("HIGHLIGHTED TEXT", hText)

        highlight_matches = _order_highlight_matches(hText, actual_page_text, threshold)
        if print_results:
            displayResult("HIGHLIGHTED TEXT MATCHES", highlight_matches)

        if include_images:
            # Keep the page on screen until the user presses ESC, 'q' or 'x'.
            while True:
                cv2.imshow('Original Image', img)
                key = cv2.waitKey(1) & 0xFF
                if key == 27 or key in (ord('q'), ord('x')):
                    break

            getImages(img, images_folder, show_contours=show_image_process, show_result=show_image_process)
            possible_captions = getImageCaption(actual_page_text_for_headings)
            images, paths = load_images_from_folder(images_folder)
            image_path_result = display_images_grid(images, paths, close_image_window=False, pdf_image_path='')
            image_caption_result = display_strings(possible_captions, window_name="Select The Correct Caption: ", close_caption_window = False, pdf_caption_text = '')

        # Structure the matches by the headings they contain.
        headings = getHeadings(actual_page_text_for_headings)
        if print_results:
            displayResult("PAGE HEADINGS", headings)
        highlight_matches = _insert_headings(highlight_matches, headings)

        # Counted after heading insertion, so marker lines are part of the list being counted.
        highlight_matches_count = get_count(highlight_matches, 0)

        # Append this page's section to the document-wide results.
        results.append(f"Page {page_num}:\n")
        results.extend(highlight_matches)
        if include_images:
            results.append(image_path_result)
            results.append(image_caption_result)
        # A stray apostrophe that used to be printed before the " | " separator is removed here.
        results.append(f"\n\nOriginal Text Length {original_count}\n | \nHighlighted Text Count: {highlight_matches_count}")
        results.append("\n\n==============================================================\n\n")

        if show_progress:
            print(f"[INFO] Page {page_num} of {doc.page_count}: Extraction completed with {highlight_matches_count} highlights from {original_count} original text and {len(headings)} headings.\n")

    if print_results:
        displayResult("RESULT", results)
    cv2.destroyAllWindows()
    return results
def home_screen():
    """
    Print the OSPDF landing screen.

    Two sections are written to the terminal, each coloured by passing it
    through `gradient_text` with its own colour list and then printed one
    line at a time:
    - the ASCII-art title;
    - the welcome message (version, feature overview and usage tips).

    Takes no arguments and returns nothing.
    """
    banner = r'''
_____ _____
( ___ )---------------------------( ___ )
| | | |
| | _ __ | |
| | ___ ___ _ __ __| |/ _| | |
| | / _ \/ __| '_ \ / _` | |_ | |
| | | (_) \__ \ |_) | (_| | _| | |
| | \___/|___/ .__/ \__,_|_| | |
| | |_| | |
|___| |___|
(_____)---------------------------(_____)
'''
    greeting = '''
Welcome to OSPDF!
Version: 0.0.1
OSPDF helps you manage and work with PDFs. Here are some of the things you can do:
- Summarize PDF content based on highlighted text.
- Split a PDF into individual pages or ranges.
- Merge multiple PDFs into one.
- Convert a PDF page into an image.
Tips
------
- Use `init` so that you would not need to state the input file every time you want to do an operation.
- You can reset or clear you session by running `ospdf init -r | --reset-state`.
- Use the -h | --help when in doubt or confused.
'''

    # Title first, then the welcome text; each section has its own colour list.
    sections = ((banner, [196, 214, 226]), (greeting, [211, 220, 228]))
    for text, palette in sections:
        for row in gradient_text(text, palette).splitlines():
            print(row)
def _fail(message):
    """Print *message* to stdout and terminate the process with exit status 1."""
    print(message)
    sys.exit(1)


def _build_parser():
    """Create the `ospdf` argument parser with the init/summarize/split/merge/pdf2img sub-commands."""
    parser = ag.ArgumentParser(prog='ospdf', description='Manage and summarize PDF files effectively with custom commands.')
    parser.add_argument('-v', '--version', action='version', version=f"{parser.prog} 0.0.1")
    sub_parser = parser.add_subparsers(dest='command', required=False, parser_class=ag.ArgumentParser)

    init_parser = sub_parser.add_parser('init', help='Set up the tool with a specific PDF file and optionally adjust its state.')
    init_parser.add_argument('pdf_file', nargs='?', default='', type=str, help='Path to the target PDF file to be used. Leave empty to use the current state.')
    init_parser.add_argument('-d','--dont-persist-state', action='store_true', help='Specify if the current state should not be saved. By default, the state will persist.')
    init_parser.add_argument('-r','--reset-state', action='store_true', help='Reset the saved state to start fresh with the specified or default PDF file.')

    summariser_parser = sub_parser.add_parser('summarize', help='Summarize PDF content based on highlighted text.', description='Use this command to generate summaries from highlighted sections of a PDF. You can include images, customize the output format, and control various processing options.')
    summariser_parser.add_argument('-i', '--include-images', action='store_true', help='Include images from the PDF in the summary output (PDF or DOCX formats).')
    summariser_parser.add_argument('-o', '--output-path', type=str, help='Path to save the summary file. Supports PDF, DOCX, or TXT formats based on selected options.')
    summariser_parser.add_argument('-u', '--input-path', type=str, help='Path to the input PDF file for summarization. Overrides any saved state.')
    summariser_parser.add_argument('-p', '--print-results', action='store_true', help='Display the summarization results directly in the terminal.')
    summariser_parser.add_argument('-s', '--show-progress', action='store_true', help='Display progress updates in the terminal during summarization.')
    summariser_parser.add_argument('-a', '--show-image-process', action='store_true', help='Preview each image during processing. Use cautiously as it may slow down operations and clutter the screen.')
    summariser_parser.add_argument('--pdf', action='store_true', help='Save the summary as a PDF file.')
    summariser_parser.add_argument('--txt', action='store_true', help='Save the summary as a plain text file.')
    summariser_parser.add_argument('--docx', action='store_true', help='Save the summary as a Word document.')
    summariser_parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose mode to display all debug and status messages.')
    summariser_parser.add_argument('-t', '--threshold', type=int, default=50, help='Set the summarization accuracy threshold (0-100). Higher values prioritize precision.')

    split_parser = sub_parser.add_parser('split', help='Split a PDF to extract a single page or a range of pages.', description='Use this command to split a PDF file, extracting either a single page or a specified range of pages into a new file.')
    split_parser.add_argument('input_pdf_file', nargs='?', type=str, help='Path to the input PDF file. Overrides any saved state if provided.')
    split_parser.add_argument('output_pdf_file', type=str, help='Path to save the extracted pages as a new PDF file.')
    split_parser.add_argument('-s', '--start-page', type=int, default=None, help='Specify the starting page number for extraction. Defaults to the first page.')
    split_parser.add_argument('-e', '--end-page', type=int, default=None, help='Specify the ending page number for extraction. Defaults to the last page.')

    merge_parser = sub_parser.add_parser('merge', help='Merge multiple PDF files into a single PDF.', description='Combine multiple PDF files into one. Specify the output file and input files to merge.')
    merge_parser.add_argument('output_pdf_file', help="Path to save the merged PDF file.")
    merge_parser.add_argument('input_pdf_files', nargs='*', help="Paths to the input PDF files to be merged. Provide two or more PDF file paths.")

    pdf2img_parser = sub_parser.add_parser('pdf2img', help='Convert a single PDF page to an image.', description='Extract a specific page from a PDF and convert it into an image file.')
    pdf2img_parser.add_argument('input_pdf_file', nargs='?', type=str, help='Path to the input PDF file for page-to-image conversion. Overrides saved state if provided.')
    pdf2img_parser.add_argument('output_img_path', type=str, help='Path to save the converted image file.')
    pdf2img_parser.add_argument('page_number', type=int, default=None, help='The page number to extract and convert to an image.')

    # With no sub-command, `main` falls back to this default and shows the home screen.
    parser.set_defaults(func=home_screen)
    return parser


def _resolve_input_pdf(cli_path, missing_message):
    """Return ``(pdf_path, persist_state)`` from the saved state and/or the command line.

    A path given on the command line always wins over the saved one. When
    `load_state` raises ValueError and no path was given, the process exits
    with *missing_message*. When it raises but a path was given,
    ``persist_state`` is False, so every command binds it before testing it.
    """
    try:
        saved_path, persist_state = load_state()
    except ValueError:
        if not cli_path:
            _fail(missing_message)
        return cli_path, False
    return (cli_path or saved_path), persist_state


def _require_existing_pdf(path):
    """Exit with an error unless *path* is an existing file whose name ends in `.pdf`."""
    if not os.path.isfile(path):
        _fail(f"Error: {path} doesn't exist")
    if not path.lower().endswith('.pdf'):
        _fail(f"Error: {path} is not a PDF file. It must end with `.pdf`")


def _require_output_directory(output_path):
    """Exit with an error when *output_path* names a directory that does not exist."""
    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.exists(output_dir):
        _fail(f"Error: The directory '{output_dir}' doesn't exist.")


def _require_pdf_output(output_path):
    """Validate an output path that must be a `.pdf` file inside an existing directory."""
    _require_output_directory(output_path)
    if not output_path.lower().endswith('.pdf'):
        _fail("Error: The output file must have a `.pdf` extension.")


def _clear_state_unless_persisted(persist_state):
    """Clear the saved state after a command when it was not meant to persist."""
    if not persist_state:
        print("Clearing state...")
        clear_state()


def _cmd_init(args):
    """Handle `ospdf init`: reset the saved state, or save a new PDF path into it."""
    if args.reset_state:
        if not os.path.exists(STATE_FILE):
            _fail("Error: No saved state to reset.")
        clear_state()
        sys.exit(0)

    if not args.pdf_file:
        _fail("Error: You must specify a PDF file unless using --reset-state/-r.")
    pdf_path = args.pdf_file
    if not os.path.exists(pdf_path):
        _fail(f"Error: File '{pdf_path}' does not exist.")

    print(f"Initialized with PDF file: {pdf_path}")
    if args.dont_persist_state:
        save_state(pdf_path, False)
        print("Current State would Not be preserved.")
    else:
        save_state(pdf_path, True)


def _cmd_summarize(args):
    """Handle `ospdf summarize`: validate options, run `get_summary` and save the result."""
    pdf_path, persist_state = _resolve_input_pdf(args.input_path, "No input file stated")
    if not os.path.isfile(pdf_path):
        _fail(f"Error: {pdf_path} doesn't exist")

    # DOCX wins over TXT, and PDF is the fallback when neither is requested.
    if args.docx:
        output_format = "docx"
    elif args.txt:
        output_format = "txt"
    else:
        output_format = "pdf"

    # The default file name follows the chosen format; a fixed `.pdf` default
    # made `--txt` / `--docx` without `-o` fail the extension check below.
    output_path = args.output_path or f"OS_PDF_SUMMARIZATION_OUTPUT.{output_format}"
    _require_output_directory(output_path)
    for requested, extension in ((args.pdf, '.pdf'), (args.docx, '.docx'), (args.txt, '.txt')):
        if requested and not output_path.lower().endswith(extension):
            _fail(f"Error: The output file must have a `{extension}` extension.")

    if not 0 <= args.threshold <= 100:
        _fail(f"Error: {args.threshold} is invalid. Threshold must be between 0 and 100.")

    # --verbose switches every diagnostic on. --show-progress now enables
    # progress output, as its help text says; it used to disable it.
    print_results = args.print_results or args.verbose
    show_progress = args.show_progress or args.verbose
    show_image_process = args.show_image_process or args.verbose

    savers = {"docx": save_to_docx, "txt": save_to_txt, "pdf": save_to_pdf}
    doc = fitz.open(pdf_path)
    try:
        results = get_summary(doc, include_images=args.include_images, images_folder=".ospdf-tmp-images", print_results=print_results, show_progress=show_progress, threshold=args.threshold, show_image_process=show_image_process)
        savers[output_format](results, output_path)
    finally:
        # Close the document even when summarising or saving fails.
        doc.close()

    _clear_state_unless_persisted(persist_state)


def _cmd_split(args):
    """Handle `ospdf split`: extract one page or a page range into a new PDF."""
    input_pdf_file, persist_state = _resolve_input_pdf(args.input_pdf_file, "Error: No input file stated. Please provide an input file.")
    _require_existing_pdf(input_pdf_file)

    output_pdf_file = args.output_pdf_file
    _require_pdf_output(output_pdf_file)

    # argparse already converted these to int (or None when omitted). They are
    # compared with None rather than by truthiness so that 0 is rejected as a
    # page number instead of being treated as "not given".
    start_page = args.start_page
    end_page = args.end_page
    if start_page is not None and start_page < 1:
        _fail(f"Error: {start_page} must be an integer greater than 0.")
    if end_page is not None and end_page < 1:
        _fail(f"Error: {end_page} must be an integer greater than 0.")
    if start_page is None and end_page is None:
        _fail("Both Start and End Page Cannot be Empty")
    if start_page is None:
        _fail("If you want to split only one page, then set the start page only, else you would need both start and end pages set.")
    if end_page is not None and end_page < start_page:
        _fail("Error: End page must be greater than or equal to start page.")

    split_pdf(input_pdf_file, output_pdf_file, start_page=start_page, end_page=end_page)
    print(f"Successfully Split {input_pdf_file} and is saved at {output_pdf_file}")

    _clear_state_unless_persisted(persist_state)


def _cmd_merge(args):
    """Handle `ospdf merge`: combine two or more PDFs into one output file."""
    if len(args.input_pdf_files) < 2:
        _fail("Error: At least two input PDFs must be provided to merge.")
    for input_pdf in args.input_pdf_files:
        _require_existing_pdf(input_pdf)

    _require_pdf_output(args.output_pdf_file)
    merge_pdfs(args.output_pdf_file, *args.input_pdf_files)


def _cmd_pdf2img(args):
    """Handle `ospdf pdf2img`: render one page of a PDF and save it as an image."""
    input_pdf_file, persist_state = _resolve_input_pdf(args.input_pdf_file, "Error: No input file stated. Please provide an input file.")
    _require_existing_pdf(input_pdf_file)

    output_img_path = args.output_img_path
    if os.path.isdir(output_img_path):
        _fail(f"Error: '{output_img_path}' is a directory. Please provide a full file path including the file name and extension (e.g., 'output_image.png').")
    if os.path.basename(output_img_path) == output_img_path:
        # Only a file name was given, so the image goes to the working directory.
        output_img_path = os.path.join(os.getcwd(), output_img_path)
        print(f"Saving image to the current directory: {output_img_path}")

    valid_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    if not output_img_path.lower().endswith(valid_extensions):
        _fail("Error: Please provide a valid file name with a supported image extension (e.g., .png, .jpg).")

    # argparse guarantees an int here; only the range still needs checking.
    page_number = args.page_number
    if page_number < 1:
        _fail(f"Error: {page_number} must be an integer greater than 0.")

    doc = fitz.open(input_pdf_file)
    try:
        if page_number > doc.page_count:
            _fail(f"Error: {page_number} is out of range. The document has {doc.page_count} pages.")

        # Create the target directory only once the request is known to be valid.
        directory = os.path.dirname(output_img_path)
        if not os.path.isdir(directory):
            try:
                os.makedirs(directory)
                print(f"Directory '{directory}' created.")
            except OSError as e:
                _fail(f"Error: Failed to create directory '{directory}'. {e}")

        save_image(pdf_page_to_image(doc, page_number), output_img_path)
    finally:
        # The document handle was previously never closed.
        doc.close()

    _clear_state_unless_persisted(persist_state)


def main():
    """
    Entry point of the `ospdf` command-line tool.

    Ensures the `.ospdf-tmp-images` working folder exists, parses the command
    line and dispatches to the handler of the chosen sub-command:
    - `init`      -- save (or reset) the PDF path kept between invocations;
    - `summarize` -- summarise a PDF from its highlights and save as PDF/DOCX/TXT;
    - `split`     -- extract a page or page range into a new PDF;
    - `merge`     -- merge several PDFs into one;
    - `pdf2img`   -- save one PDF page as an image.
    Without a sub-command the home screen is shown.

    Validation errors are printed to stdout and end the process with exit
    status 1 via `sys.exit`.
    """
    check_and_create_folder(".ospdf-tmp-images")
    args = _build_parser().parse_args()

    handlers = {
        'init': _cmd_init,
        'summarize': _cmd_summarize,
        'split': _cmd_split,
        'merge': _cmd_merge,
        'pdf2img': _cmd_pdf2img,
    }
    handler = handlers.get(args.command)
    if handler is None:
        # No sub-command given: run the parser default (the home screen).
        args.func()
    else:
        handler(args)
if __name__ == "__main__":
    # Run the CLI only when this file is executed as a script, not when imported.
    main()