-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
doc-toc.el
1059 lines (942 loc) · 43.3 KB
/
doc-toc.el
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
;;; doc-toc.el --- Manage outlines/table of contents of pdf and djvu documents -*- lexical-binding: t; -*-
;; Copyright (C) 2022 Free Software Foundation, Inc.
;; Author: Daniel Laurens Nicolai <dalanicolai@gmail.com>
;; Version: 1.02
;; Keywords: tools, outlines, convenience
;; Package-Requires: ((emacs "26.1"))
;; URL: https://github.com/dalanicolai/doc-tools-toc
;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <https://www.gnu.org/licenses/>.
;;; Commentary:
;; doc-toc.el is a package for creating and adding Tables of Contents to pdf
;; and djvu documents. It includes features for extracting the Table of Contents
;; from the textlayer of a document or via OCR if that last option is necessary
;; (or prefered). For 'software generated' PDFs it provides the option to use
;; pdf.tocgen (see URL `https://krasjet.com/voice/pdf.tocgen/'). Additionally,
;; this package implements various features for assisting in tidying up the
;; extracted Table of Contents, adjusting the pagenumbers and finally parsing
;; the Table of Contents into syntax that is understood by the `pdfoutline' and
;; `djvused' commands that are used to add the table of contents to pdf- and
;; djvu-files respectively.
;; Requirements: To use the pdf.tocgen functionality that software has to be
;; installed (see URL `https://krasjet.com/voice/pdf.tocgen/'). For the
;; remaining functions the package requires the `pdftotext' (part of
;; poppler-utils), `pdfoutline' (part of fntsample) and `djvused' (part of
;; http://djvu.sourceforge.net/) command line utilities to be available.
;; Extraction with OCR requires the tesseract command line utility to be
;; available.
;; Usage:
;; In each step below, check out available shortcuts using C-h m. Additionally
;; you can find available functions by typing the M-x mode-name (e.g. M-x
;; doc-toc-cleanup), or with two dashes in the mode name (e.g. M-x doc-toc--cleanup). Of
;; course if you use packages like Ivy or Helm you just use the fuzzy search
;; functionality.
;; Extraction and adding contents to a document is done in 4 steps:
;; 1 extraction
;; 2 cleanup
;; 3 adjust/correct pagenumbers
;; 4 add TOC to document
;; 1. Extraction: For PDFs without TOC pages, with a very complicated TOC (i.e.
;; that require much cleanup work) or with headlines well fitted for automatic
;; extraction (you will have to decide for yourself by trying it) consider to
;; use the pdf.tocgen (URL `https://krasjet.com/voice/pdf.tocgen/')
;; functionality described below. Otherwise, start with opening some pdf or djvu
;; file in Emacs (pdf-tools and djvu package recommended). Find the pagenumbers
;; for the TOC. Then type M-x `doc-toc-extract-pages', or M-x
;; `doc-toc-extract-pages-ocr' if doc has no text layer or text layer is bad, and
;; answer the subsequent prompts by entering the pagenumbers for the first and
;; the last page each followed by RET. For PDF extraction with OCR, currently it
;; is required to view all contents pages once before extraction (doc-toc uses
;; the cached file data). Also the languages used for tesseract OCR can be
;; customized via the `doc-toc-ocr-languages' variable. A buffer with the, somewhat
;; cleaned up, extracted text will open in TOC-cleanup mode. Prefix command with
;; the universal argument (C-u) to omit clean and get the raw text. If the
;; extracted text is of too low quality you either can hack/extend the
;; `doc-toc-extract-pages-ocr' definition, or alternatively you can try to extract
;; the text with the python document-contents-extractor script (see URL
;; `https://pypi.org/project/document-contents-extractor/'), which is more
;; configurable (you are also welcome to hack and improve that script).
;; The documentation at URL
;; `https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html' might be
;; useful.
;; For TOC's that are formatted as two columns per page, prepend the
;; `doc-toc-extract-pages-ocr' command with two universal arguments. Then after you
;; are asked for the start and finish pagenumbers, a third question asks you to
;; set the tesseract psm code. For the double column layout it is best (as far
;; as I know) to use psm code '1'.
;; Software-generated PDF's with pdf.tocgen
;; For 'software-generated' (i.e. PDF's not created from scans) PDF-files it is
;; sometimes easier to use `doc-toc-extract-with-pdf-tocgen'. To use this function
;; you first have to provide the font properties for the different headline
;; levels. For that select the word in a headline of a certain level and then
;; type M-x `doc-toc-gen-set-level'. This function will ask which level you are
;; setting, the highest level should be level 1. After you have set the various
;; levels (1,2, etc.) then it is time to run M-x `doc-toc-extract-with-pdf-tocgen'.
;; If a TOC is extracted succesfully, then in the pdftocgen-mode buffer simply
;; press C-c C-c to add the contents to the PDF. The contents will be added to a
;; copy of the original PDF with the filename output.pdf and this copy will be
;; opened in a new buffer. If the pdf-tocgen option does not work well then
;; continue with the steps below.
;; If you merely want to extract text without further processing then you can
;; use the command `doc-toc-extract-only'.
;; 2. TOC-Cleanup: In this mode you can further cleanup the contents to create a
;; list where each line has the structure:
;; TITLE (SOME) PAGENUMBER
;; (If the initial TOC looks bad/unusable then try to use then universal
;; argument C-u before extraction in the previous step and/or try the ocr option
;; with or without the universal argument)
;; There can be any number of spaces between TITLE and PAGE. The correct
;; pagenumbers can be edited in the next step. A document outline supports
;; different levels and levels are automatically assigned in order of increasing
;; number of preceding spaces, i.e. the lines with the least amount of preceding
;; spaces are assigned level 0 etc., and lines with equal number of spaces get
;; assigned the same levels.
;; Contents 1
;; Chapter 1 2
;; Section 1 3
;; Section 1.1 4
;; Chapter 2 5
;; There are some handy functions to assist in the cleanup. C-c C-j jumps
;; automatically to the next line not ending with a number and joins it with the
;; next line. If the indentation structure of the different lines does not
;; correspond with the levels, then the levels can be set automatically from the
;; number of separatorss in the indices with M-x doc-toc-cleanup-set-level-by-index.
;; The default separators is a . but a different separators can be entered by
;; preceding the function invocation with the universal argument (C-u). Some
;; documents contain a structure like
;; 1 Chapter 1 1
;; Section 1 2
;; Here the indentation can be set with M-x replace-regexp ^[^0-9] -> \& (where
;; there is a space character before the \&).
;; Type C-c C-c when finished
;; 3. TOC-tabular (adjust pagenumbers): This mode provides the functionality for
;; easy adjustment of pagenmumbers. The buffer can be navigated with the arrow
;; up/down keys. The left and right arrow keys will shift down/up all the page
;; numbers from the current line and below (combine with SHIFT for setting
;; individual pagenumbers).
;; The TAB key jumps to the pagenumber of the current line, while C-right/C-left
;; will shift all remaining page numbers up/down while jumping/scrolling to the
;; line its page in the document window. to the S-up/S-donw in the tablist
;; window will just scroll page up/down in the document window and, only for
;; pdf, C-up/C-down will scroll smoothly in that window.
;; Type C-c C-c when done.
;; 4. TOC-mode (add outline to document): The text of this buffer should have
;; the right structure for adding the contents to (for pdf’s a copy of) the
;; original document. Final adjusments can be done but should not be necessary.
;; Type C-c C-c for adding the contents to the document.
;; By default, the TOC is simply added to the original file. ONLY FOR PDF’s, if
;; the (customizable) variable doc-toc-replace-original-file is nil, then the TOC is
;; added to a copy of the original pdf file with the path as defined by the
;; variable doc-toc-destination-file-name. Either a relative path to the original
;; file directory or an absolute path can be given.
;; Sometimes the `pdfoutline/djvused' application is not able to add the TOC to
;; the document. In that case you can either debug the problem by copying the
;; used terminal command from the `*messages*' buffer and run it manually in the
;; document's folder, or you can delete the outline source buffer and run
;; `doc-toc--tablist-to-handyoutliner' from the tablist buffer to get an outline
;; source file that can be used with HandyOutliner (see URL
;; `http://handyoutlinerfo.sourceforge.net/') Unfortunately the handyoutliner
;; command does not take arguments, but if you customize the
;; `doc-toc-handyoutliner-path' and `doc-toc-file-browser-command' variables, then Emacs
;; will try to open HandyOutliner and the file browser so that you can drag the
;; files directly into HandyOutliner).
;; Finally, if you just want to extract some text
;; Keybindings
;; Key Binding Description
;; all-modes (i.e. all steps)
;; C-c C-c dispatch (next step)
;; doc-toc-cleanup-mode
;; C-c C-j doc-toc--join-next-unnumbered-lines
;; C-c C-s doc-toc--roman-to-arabic
;; doc-toc (tablist)
;; TAB~ preview/jump-to-page
;; right/left doc-toc-in/decrease-remaining
;; C-right/C-left doc-toc-in/decrease-remaining and view page
;; S-right/S-left in/decrease pagenumber current entry
;; C-down/C-up scroll document other window (if document buffer shown)
;; S-down/S-up full page scroll document other window ( idem )
;; C-j doc-toc--jump-to-next-entry-by-level
;;; Code:
(eval-when-compile
(require 'pdf-tools nil t))
(require 'djvu nil t)
(require 'evil nil t)
(require 'seq)
(require 'rst)
;; List of declarations to eliminate byte-compile errors
(defvar djvu-doc-image)
(defvar doc-buffer)
(defvar pdf-filename)
(declare-function pdf-cache-get-image "pdf-cache")
(declare-function pdf-view-goto-page "pdf-view")
(declare-function pdf-view-next-page "pdf-view")
(declare-function pdf-view-previous-page "pdf-view")
(declare-function pdf-view-scroll-up-or-next-page "pdf-view")
(declare-function pdf-view-scroll-down-or-previous-page "pdf-view")
(declare-function djvu-goto-page "djvu")
(declare-function djvu-next-page "djvu")
(declare-function djvu-prev-page "djvu")
(declare-function djvu-scroll-up-or-next-page "djvu")
(declare-function djvu-scroll-down-or-previous-page "djvu")
(declare-function evil-scroll-page-down "evil-commands")
(declare-function evil-scroll-page-up "evil-commands")
;;;; Customize definitions
(defgroup toc nil
"Setting for the doc-toc package"
:group 'data)
(defcustom doc-toc-replace-original-file t
"For PDF include TOC and replace old PDF file.
For DJVU the old DJVU file is replaced by default"
:type 'boolean)
(defcustom doc-toc-destination-file-name "pdfwithtoc.pdf"
"Filename for new PDF if `doc-toc-replace-original-file' is nil."
:type 'file)
(defcustom doc-toc-ocr-languages nil
"Languages used for extraction with ocr.
Should be one or multiple language codes as recognized
by tesseract -l flag, e.g. eng or eng+nld. Use
\\[execute-extended-command] `doc-toc-list-languages' to list the
available languages."
:type 'string)
(defcustom doc-toc-handyoutliner-path nil
"Path to handyoutliner executable.
String (i.e. surround with double quotes). See
URL`http://handyoutlinerfo.sourceforge.net/'."
:type 'file)
(defcustom doc-toc-file-browser-command nil
"Command to open file browser.
String (i.e. surround with double quotes)."
:type 'file)
;;;; pdf.tocgen
;;;###autoload
(defun doc-toc-gen-set-level (level)
"Define the text properties of the heading level.
In a pdf-view buffer select a single word in the headline of a
certain level. Then run `doc-toc-gen-set-level' to write the text
properties to the recipe.toml file that is created in the
document's directory. You will be prompted to enter the LEVEL
number. The highest level should have number 1, the next level
number 2 etc."
(interactive "nWhich level you are setting (number): ")
(let* ((page (pdf-view-current-page))
(filename (url-filename (url-generic-parse-url buffer-file-name)))
(pdfxmeta-result (shell-command
(format "pdfxmeta --auto %s --page %s %s \"%s\" >> recipe.toml"
level
page
(shell-quote-argument filename)
(car (pdf-view-active-region-text))))))
;; (pdfxmeta-result (call-process "pdfxmeta" nil "recipe.toml" nil
;; "--auto" (number-to-string level)
;; "--page" (number-to-string page)
;; (shell-quote-argument filename)
;; (concat "\"" (car (pdf-view-active-region-text)) "\""))))
(when (eq pdfxmeta-result 1)
(let ((page-text (shell-command-to-string
(format "mutool draw -F text %s %s"
(shell-quote-argument filename)
page
))))
(pop-to-buffer "page-text")
(insert
"COULD NOT SET HEADING LEVEL. MUPDF EXTRACTED FOLLOWING PAGE TEXT FROM PAGE:\n")
(add-text-properties 1 (point) '(face font-lock-warning-face))
(let ((beg (point)))
(insert "(try to select partial word)\n\n")
(add-text-properties beg (point) '(face font-lock-warning-face)))
(insert page-text)
(goto-char (point-min))))))
(defun doc-toc-extract-with-pdf-tocgen ()
"Extract Table of Contents with `pdf-tocgen'.
Inserts the extracted TOC to a newly created buffer. This
function requires `pdf.tocgen' to be installed (see URL
`https://krasjet.com/voice/pdf.tocgen/'). The function can only
be used after the headline text properties have been defined with
the function `doc-toc-gen-set-level'"
(interactive)
(let ((filename buffer-file-name)
(toc (shell-command-to-string
(format "pdftocgen %s < recipe.toml" (shell-quote-argument buffer-file-name)))))
(switch-to-buffer "toc")
(doc-toc-pdftocgen-mode) ;; required before setting local variable
(when (fboundp 'flyspell-mode)
flyspell-mode)
(setq-local pdf-filename filename)
(insert toc)))
(defun doc-toc--pdftocgen-add-to-pdf ()
"Add content extracted with `pdf.tocgen' to copy of original PDF.
The newly created PDF that includes the TOC is written to a file
named output.pdf and opened in a new buffer. Don't forget to
rename this new file."
(interactive)
(let* ((output-buf (get-buffer-create "*pdftocio-output*")))
(call-process-region (point-min) (point-max) "pdftocio"
nil output-buf nil pdf-filename)
(kill-buffer-if-not-modified (find-file pdf-filename))
(when (file-exists-p (concat (file-name-base pdf-filename) "_out.pdf"))
(delete-file pdf-filename)
(rename-file (concat (file-name-base pdf-filename) "_out.pdf") pdf-filename))
(find-file pdf-filename)
(if (> (buffer-size output-buf) 0)
(pop-to-buffer output-buf)
(kill-buffer output-buf))))
(defvar doc-toc-pdftocgen-mode-map
(let ((map (make-sparse-keymap)))
(define-key map "\C-c\C-c" #'doc-toc--pdftocgen-add-to-pdf)
map))
(define-derived-mode doc-toc-pdftocgen-mode
fundamental-mode "TOC-cleanup"
"Major mode for cleaning up Table Of Contents
\\{doc-toc-pdftocgen-mode-map}")
;;;; doc-toc-extract and cleanup
;;; doc-toc-cleanup
(defun doc-toc--cleanup-dots ()
"Remove dots between heading its title and page number."
(interactive)
(goto-char (point-min))
(while (not (eobp))
(re-search-forward "\\([\\. ]*\\)\\([0-9ivx]*\\) *$")
(replace-match " \\2")
(forward-line 1)))
(defun doc-toc--cleanup-dots-ocr ()
"Remove dots between heading its title and page number.
Like `doc-toc--cleanup-dots' but more suited for use after OCR"
(interactive)
(goto-char (point-min))
(while (re-search-forward "\\([0-9\\. \\-]*\\)\\( [0-9]* *\\)$" nil t)
(replace-match " \\2")))
(defun doc-toc--cleanup-lines-contents-string ()
"Delete all lines containing the string \"contents\"."
(interactive)
(flush-lines "contents"))
(defun doc-toc--cleanup-lines-roman-string ()
"Delete all lines that contain only linefeeds and/or blanks
and/or roman numerals."
(interactive)
(goto-char (point-min))
(while (not (eobp))
(re-search-forward "^[\f ]*[ivx0-9\\.]* *$")
(replace-match "")
(forward-line 1)))
(defun doc-toc-cleanup-blank-lines ()
"Delete all empty lines."
(interactive)
(goto-char (point-min))
(flush-lines "^ *$"))
(defun doc-toc--roman-to-arabic (arg)
"Transform roman pagenumbers to Hindu-Arabic numerals.
This function only works for lines that end with roman numerals.
Prefix with numeric ARG prefix to apply to the next ARG lines."
(interactive "p")
(dotimes (_x arg)
(move-end-of-line 1)
(let ((latin (number-to-string
(rst-roman-to-arabic
(thing-at-point 'word t)))))
(backward-word)
(kill-word 1)
(insert latin)
(forward-line))))
(defun doc-toc--join-next-unnumbered-lines ()
"Search from point for first occurence of line not ending with
Western Arabic numerals."
(interactive)
(re-search-forward "[^0-9]\\s-*$" nil t)
(join-line 1))
(defun doc-toc--jump-next-overindexed-index ()
"Jump to next line with unwanted dot behind its index."
(interactive)
(re-search-forward "^[0-9\\.]*\\. " nil t))
(defun doc-toc--cleanup (contents-page &optional arg)
"Cleanup extracted Table Of Contents by running a series of
cleanup functions.
It executes the following steps:
1. insert a Contents entry with pagenumber CONTENTS-PAGE
2. delete subsequent lines containing the string \"contents\"
3. tries to delete redundant dots
4. deletes lines that containi only roman numerals and linefeed characters
5. deletes
When ARG is non-nil it skips the last three steps"
(interactive)
(goto-char (point-min))
(when (search-forward "contents" nil t)
(replace-match (format "Contents %s" contents-page)))
(doc-toc--cleanup-lines-contents-string)
(if arg
(doc-toc--cleanup-dots-ocr)
(doc-toc--cleanup-dots))
(doc-toc--cleanup-lines-roman-string)
(doc-toc-cleanup-blank-lines))
(defun doc-toc--get-section-inidices (separators)
"Determine index part of current line. Counting the number of SEPARATORS."
(let* ((string (thing-at-point 'line t))
(sep (cond (separators)
("\\."))))
(string-match (format "^\\([[:alnum:]]+%s\\)*" sep) string)
(match-string 0 string)))
(defun doc-toc--count-level-by-index (separators)
"Determine level of current line in TOC tree by counting SEPARATORS."
(let* ((index (doc-toc--get-section-inidices separators))
(sep (cond (separators)
("\\.")))
(string-list (split-string index sep t)))
(length string-list)))
(defun doc-toc-cleanup-set-level-by-index (&optional arg)
"Automatic set indentation by number of separatorss in index.
By default uses dots as separators. Prepend with universal
ARG (\\[universal-argument]) to enter different separators."
(interactive "P")
(let ((separators (if arg
(read-string
"Enter index separators as regexp (escape with \\ if required): ")
nil)))
(goto-char (point-min))
(while (re-search-forward "^\\s-+" nil t)
(replace-match ""))
(goto-char (point-min))
(while (not (eobp))
(let* ((level (doc-toc--count-level-by-index separators)))
(dotimes (_x level) (insert " "))
(forward-line 1)))))
;;; toc extract
(defun doc-toc--document-extract-pages-text (startpage endpage)
"Extract text from text layer of current document from STARTPAGE to ENDPAGE."
(let* ((source-buffer (current-buffer))
(ext (url-file-extension (buffer-file-name (current-buffer))))
(default-process-coding-system
(cond ((string= ".pdf" ext)'(windows-1252-unix . utf-8-unix))
((string= ".djvu" ext) '(utf-8-unix . utf-8-unix))))
(shell-command (cond ((string= ".pdf" ext) "pdftotext -f %s -l %s -layout %s -")
((string= ".djvu" ext) "djvutxt --page=%s-%s %s")
(t (error "Buffer-filename does not have pdf or djvu extension"))))
(text (shell-command-to-string
(format shell-command
startpage
endpage
(shell-quote-argument buffer-file-name))))
(buffer (get-buffer-create (file-name-sans-extension (buffer-name)))))
(switch-to-buffer buffer)
(doc-toc-cleanup-mode) ;; required before setting local variable
(when (fboundp 'flyspell-mode)
flyspell-mode)
(setq-local doc-buffer source-buffer)
(insert text)))
;;;###autoload
(defun doc-toc-extract-pages (arg)
"Extract text from text layer of current document and cleanup.
Extract from STARTPAGE to ENDPAGE. Use with the universal
ARG (\\[universal-argument]) omits cleanup to get the unprocessed
text."
(interactive "P")
(let ((mode (derived-mode-p 'pdf-view-mode 'djvu-read-mode)))
(if mode
(let* ((startpage (read-string "Enter start-pagenumber for extraction: "))
(endpage (read-string "Enter end-pagenumber for extraction: ")))
(doc-toc--document-extract-pages-text startpage endpage)
(unless arg
(doc-toc--cleanup startpage)))
(message "Buffer not in pdf-view- or djvu-read-mode"))))
(defun doc-toc-list-languages ()
"List languages available for OCR.
For use in `doc-toc-ocr-languages'."
(interactive)
(let ((print-length nil))
(message (format "%s" (seq-subseq
(split-string
(shell-command-to-string "tesseract --list-langs"))
5)))))
;;;###autoload
(defun doc-toc-extract-pages-ocr (arg)
"Extract via OCR text of current document and cleanup.
Extract from STARTPAGE to ENDPAGE. Use with the universal
ARG (\\[universal-argument]) omits cleanup to get the
unprocessed text."
(interactive "p")
(let ((mode (derived-mode-p 'pdf-view-mode 'djvu-read-mode)))
(if mode
(let* ((startpage (string-to-number
(read-string "Enter start-pagenumber for extraction: ")))
(endpage (string-to-number
(read-string "Enter end-pagenumber for extraction: ")))
(page startpage)
(source-buffer (current-buffer))
(ext (url-file-extension (buffer-file-name (current-buffer))))
(buffer (file-name-sans-extension (buffer-name)))
(psm (if (= arg 16)
(read-string "Enter code (interger) for tesseract psm: ")
"6"))
(args (list "stdout" "--psm" psm)))
(when doc-toc-ocr-languages
(setq args (append args (list "-l" doc-toc-ocr-languages))))
(while (<= page (+ endpage))
(let ((file (cond ((string= ".pdf" ext)
(make-temp-file "pageimage"
nil
(number-to-string page)
(pdf-cache-get-image page 600)))
((string= ".djvu" ext)
;; new code for djvu3
(let ((outfile (format "/tmp/pageimagep%s" page)))
(shell-command (format "ddjvu -page=%s '%s' %s"
page
buffer-file-name
outfile))
outfile)))))
;; old code for original djvu.el
;; (djvu-goto-page page)
;; (make-temp-file "pageimage"
;; nil
;; (number-to-string page)
;; (image-property djvu-doc-image :data))))))
(apply #'call-process
(append (list "tesseract" nil (list buffer nil) nil file)
args))
(setq page (1+ page))))
(switch-to-buffer buffer)
(doc-toc-cleanup-mode) ;; required before setting local variable
(when (fboundp 'flyspell-mode)
(flyspell-mode))
(setq-local doc-buffer source-buffer)
(unless (or (= arg 4) (= arg 16))
(doc-toc--cleanup startpage t)))
(message "Buffer not in pdf-view- or djvu-read-mode"))))
;;;###autoload
(defun doc-toc-extract-outline ()
"Extract Table of Contents attached to current document."
(interactive)
(let* ((source-buffer (current-buffer))
(ext (url-file-extension (buffer-file-name (current-buffer))))
(shell-command (cond ((string= ".pdf" ext) (if (executable-find "mutool")
"mutool show %s outline"
"mutool command is not found"))
((string= ".djvu" ext) "djvused -e 'print-outline' %s")
(t (error "Buffer-filename does not have pdf or djvu extension"))))
(text (shell-command-to-string
(format shell-command
(shell-quote-argument buffer-file-name))))
(buffer (get-buffer-create (concat (file-name-sans-extension (buffer-name)) ".txt"))))
(switch-to-buffer buffer)
(setq-local doc-buffer source-buffer)
(insert text)))
;;;###autoload
(defun doc-toc-extract-only ()
"Just extract text via OCR without further processing.
Prompt for startpage and endpage and print OCR output to new buffer."
(interactive)
(let ((mode (derived-mode-p 'pdf-view-mode 'djvu-read-mode)))
(if mode
(let* ((page (string-to-number
(read-string "Enter start-pagenumber for extraction: ")))
(endpage (string-to-number
(read-string "Enter end-pagenumber for extraction: ")))
(ext (url-file-extension (buffer-file-name (current-buffer))))
(buffer (concat (file-name-sans-extension (buffer-name)) ".txt"))
(args (list "stdout" "--psm" "6")))
(when doc-toc-ocr-languages
(setq args (append args (list "-l" doc-toc-ocr-languages))))
(while (<= page (+ endpage))
(let ((file (cond ((string= ".pdf" ext)
(make-temp-file "pageimage"
nil
(number-to-string page)
(pdf-cache-get-image page 600)))
((string= ".djvu" ext)
(djvu-goto-page page)
(make-temp-file "pageimage"
nil
(number-to-string page)
(image-property djvu-doc-image :data))))))
(apply #'call-process "tesseract" nil (list buffer nil) nil
file args)
(setq page (1+ page))))
(switch-to-buffer buffer)))))
(defun doc-toc--create-tablist-buffer ()
"Create tablist buffer, from cleaned up Table of Contents buffer.
The tablist buffer provides features for easy page number
adjustment."
(interactive)
(doc-toc--list doc-buffer))
;;;; toc major modes
(when (require 'pdf-tools nil t)
(define-key pdf-view-mode-map (kbd "C-c C-e") #'doc-toc-extract-pages)
(define-key pdf-view-mode-map (kbd "C-c e") #'doc-toc-extract-pages-ocr))
(when (require 'djvu nil t)
(define-key djvu-read-mode-map (kbd "C-c C-e") #'doc-toc-extract-pages)
(define-key djvu-read-mode-map (kbd "C-c e") #'doc-toc-extract-pages-ocr))
(defvar doc-toc-cleanup-mode-map
(let ((map (make-sparse-keymap)))
(define-key map "\C-c\C-c" #'doc-toc--create-tablist-buffer)
(define-key map "\C-c\C-j" #'doc-toc--join-next-unnumbered-lines)
(define-key map "\C-c\C-s" #'doc-toc--roman-to-arabic)
map))
(define-derived-mode doc-toc-cleanup-mode
fundamental-mode "TOC-cleanup"
"Major mode for cleaning up Table Of Contents
\\{doc-toc-cleanup-mode-map}")
;;; toc tablist
(defun doc-toc-count-leading-spaces ()
"Count number of leading spaces on current line."
(interactive)
(let ((start (string-match "^ *" (thing-at-point 'line)))
(end (match-end 0)))
(- end start)))
(defun doc-toc--convert-to-tabulated-list ()
"Parse and prepare content of current buffer for `doc-toc-tabular-mode'."
(interactive)
(untabify (point-min) (point-max))
(goto-char (point-min))
(let (lines
levels)
(while (not (eobp))
(let ((line-list (split-string (buffer-substring (line-beginning-position) (line-end-position))))
(spaces (doc-toc-count-leading-spaces)))
(unless (member spaces levels)
(setq levels (append levels (list spaces))))
(setq lines
(append
lines
(list (list nil
(vector
(number-to-string (seq-position levels spaces))
(mapconcat #'identity (butlast line-list) " ")
(mapconcat #'identity (last line-list) " "))))))
(forward-line)))
lines))
(defun doc-toc--increase ()
"Increase pagenumber of current entry."
(interactive)
(tabulated-list-set-col
"page"
(number-to-string (+ (string-to-number (aref (tabulated-list-get-entry) 2)) 1))
t))
(defun doc-toc--decrease ()
"Decrease pagenumber of current entry."
(interactive)
(tabulated-list-set-col
"page"
(number-to-string (- (string-to-number (aref (tabulated-list-get-entry) 2)) 1))
t))
(defun doc-toc--increase-remaining ()
"Increase pagenumber of current entry and all entries below."
(interactive)
(save-excursion
(while (not (eobp))
(tabulated-list-set-col
"page"
(number-to-string (+ (string-to-number (aref (tabulated-list-get-entry) 2)) 1))
t)
(forward-line 1))))
(defun doc-toc--decrease-remaining ()
"Decrease pagenumber of current entry and all entries below."
(interactive)
(save-excursion
(while (not (eobp))
(tabulated-list-set-col
"page"
(number-to-string (- (string-to-number (aref (tabulated-list-get-entry) 2)) 1))
t)
(forward-line 1))))
(defun doc-toc--replace-input ()
(interactive)
(let* ((column (cond ((< (current-column) 5)
0)
((and (> (current-column) 5) (< (current-column) 90))
1)
((> (current-column) 90)
2)))
(old-input (aref (tabulated-list-get-entry) column))
(new-input (read-string "Replace column input with: " old-input)))
(tabulated-list-set-col column new-input t)))
(defun doc-toc--tablist-follow ()
"Preview pagenumber of current line in separate document buffer."
(interactive)
(let ((ext (url-file-extension (buffer-file-name doc-buffer)))
(page (string-to-number (aref (tabulated-list-get-entry) 2))))
(pop-to-buffer doc-buffer)
(cond ((string= ".pdf" ext) (pdf-view-goto-page page))
((string= ".djvu" ext) (djvu-goto-page page)))
(other-window 1)))
(defun doc-toc--increase-remaining-and-follow ()
"Increase pagenumber of current entry and all entries below.
Additionally preview page in separate document buffer."
(interactive)
(doc-toc--increase-remaining)
(doc-toc--tablist-follow))
(defun doc-toc--decrease-remaining-and-follow ()
"Decrease pagenumber of current entry and all entries below.
Additionally preview page in separate document buffer."
(interactive)
(doc-toc--decrease-remaining)
(doc-toc--tablist-follow))
(defun doc-toc--scroll-other-window-page-up ()
"Scroll page up in document buffer from current buffer."
(interactive)
(other-window 1)
(let ((ext (url-file-extension (buffer-file-name (current-buffer)))))
(cond ((string= ".pdf" ext) (pdf-view-next-page 1))
((string= ".djvu" ext) (djvu-next-page 1))))
(other-window 1))
(defun doc-toc--scroll-other-window-page-down ()
"Scroll page down in document buffer from current buffer."
(interactive)
(other-window 1)
(let ((ext (url-file-extension (buffer-file-name (current-buffer)))))
(cond ((string= ".pdf" ext) (pdf-view-previous-page 1))
((string= ".djvu" ext) (djvu-prev-page 1))))
(other-window 1))
(defun doc-toc--scroll-pdf-other-window-down ()
"Scroll down in document buffer from current buffer."
(interactive)
(other-window 1)
(let ((ext (url-file-extension (buffer-file-name (current-buffer)))))
(cond ((string= ".pdf" ext) (pdf-view-scroll-up-or-next-page 1))
((string= ".djvu" ext) (djvu-scroll-up-or-next-page))))
(other-window 1))
(defun doc-toc--scroll-pdf-other-window-up ()
"Scroll up in document buffer from current buffer."
(interactive)
(other-window 1)
(let ((ext (url-file-extension (buffer-file-name (current-buffer)))))
(cond ((string= ".pdf" ext) (pdf-view-scroll-down-or-previous-page 1))
((string= ".djvu" ext) (djvu-scroll-down-or-previous-page))))
(other-window 1))
(defun doc-toc--jump-to-next-entry-by-level (char)
"Jump to the next entry of level CHAR."
(interactive "cJump to next entry of level: ")
(forward-line)
(let ((level (char-to-string char)))
(while (not (or (string= (aref (tabulated-list-get-entry) 0) level) (eobp)))
(forward-line)))
(doc-toc--tablist-follow))
(defvar doc-toc-tabular-mode-map
(let ((map (make-sparse-keymap)))
(define-key map [right] #'doc-toc--increase-remaining)
(define-key map [left] #'doc-toc--decrease-remaining)
(define-key map [S-right] #'doc-toc--increase)
(define-key map [S-left] #'doc-toc--decrease)
(define-key map [C-right] #'doc-toc--increase-remaining-and-follow)
(define-key map [C-left] #'doc-toc--decrease-remaining-and-follow)
(define-key map "r" #'doc-toc--replace-input)
(define-key map [tab] #'doc-toc--tablist-follow)
(define-key map [S-down] #'doc-toc--scroll-other-window-page-up)
(define-key map [S-up] #'doc-toc--scroll-other-window-page-down)
(define-key map [C-down] #'doc-toc--scroll-pdf-other-window-down)
(define-key map [C-up] #'doc-toc--scroll-pdf-other-window-up)
(define-key map "\C-j" #'doc-toc--jump-to-next-entry-by-level)
(define-key map "\C-c\C-c" #'doc-toc--tablist-to-doc-toc-source)
(define-key map "\C-c\C-c" #'doc-toc--tablist-to-doc-toc-source)
(when (featurep 'evil-commands)
(define-key map "\S-j" #'evil-scroll-page-down)
(define-key map "\S-k" #'evil-scroll-page-up))
map))
(define-derived-mode doc-toc-tabular-mode
tabulated-list-mode "TOC-tabular"
"Major mode for Table Of Contents.
\\{doc-toc-tabular-mode-map}"
(setq-local tabulated-list-format [("level" 5 nil) ("name" 84 nil) ("page" 4 nil)])
(tabulated-list-init-header))
(defun doc-toc--list (buffer)
"Create, BUFFER, new tabular-mode-buffer for easy pagenumber adjusment."
(interactive)
(let ((source-buffer buffer)
(doc-toc-tablist (doc-toc--convert-to-tabulated-list)))
(switch-to-buffer (concat (buffer-name) ".list"))
(when (fboundp 'golden-ratio-mode)
(golden-ratio-mode))
(doc-toc-tabular-mode)
(setq-local doc-buffer source-buffer)
(setq-local tabulated-list-entries doc-toc-tablist)
(tabulated-list-print)))
;;;; parse tablist to outline
(defvar doc-toc-map
(let ((map (make-sparse-keymap)))
(define-key map "\C-c\C-c" #'doc-toc--add-to-doc)
map))
(define-derived-mode doc-toc
fundamental-mode "TOC"
"Major mode for editing pdf or djvu Table Of Contents source files
\\{doc-toc-map}")
;;; pdf parse tablist to
(defun doc-toc--tablist-to-pdfoutline ()
"Parse and prepare tablist-mode-buffer to source input.
Displays results in a newly created buffer for use as source input
to `pdfoutline' shell command."
(interactive)
(goto-char (point-min))
(let ((source-buffer doc-buffer)
text)
(while (not (eobp))
(let ((v (tabulated-list-get-entry)))
(setq text (concat text (format "%s %s %s\n" (aref v 0) (aref v 2) (aref v 1) )))
(forward-line 1)))
(switch-to-buffer (find-file (concat (file-name-sans-extension (buffer-name)) ".txt")))
(erase-buffer)
(doc-toc)
(setq-local doc-buffer source-buffer)
(insert text)))
;;; djvu parse tablist to outline
(defun doc-toc--tablist-to-djvused ()
"Parse and prepare djvused outline source form tablist."
(interactive)
(let ((source-buffer doc-buffer)
(buff (get-buffer-create
(concat
(file-name-sans-extension (buffer-name))
".txt"))))
(with-current-buffer buff
(insert "(bookmarks "))
(goto-char (point-min))
(while (not (eobp))
(let* ((v (tabulated-list-get-entry))
(level-current (string-to-number (aref v 0)))
(sexp (list (aref v 1) (format "#%s" (aref v 2))))
(v-next (progn (forward-line) (tabulated-list-get-entry)))
(level-next (when v-next (string-to-number (aref v-next 0)))))
(if level-next
(with-current-buffer buff
(cond ((= level-next level-current)
(insert (format "(%s \"%s\") " (prin1-to-string (car sexp)) (nth 1 sexp))))
((> level-next level-current)
(insert (format "(%s \"%s\" " (prin1-to-string (car sexp)) (nth 1 sexp))))
((< level-next level-current)
(insert (format "(%s \"%s\")" (prin1-to-string (car sexp)) (nth 1 sexp)))
(let ((level-diff (- level-current level-next)))
(while (> level-diff 0)
(insert ") ")
(setq level-diff (1- level-diff)))))))
(forward-line))))
(forward-line -1)
(let ((v (tabulated-list-get-entry)))
(switch-to-buffer buff)
(insert (format " (\"%s\" \"#%s\"))" (aref v 1) (aref v 2)))
(doc-toc)
(setq-local doc-buffer source-buffer))))
(defun doc-toc--tablist-to-doc-toc-source ()
"Parse and prepare source file.
From tablist-mode-buffer, parse code and create source in new
buffer to use as input for `pdfoutline' or `djvused' shell
command."
(interactive)
(let ((ext (url-file-extension (buffer-file-name doc-buffer))))
(cond ((string= ".pdf" ext) (doc-toc--tablist-to-pdfoutline))
((string= ".djvu" ext) (doc-toc--tablist-to-djvused))
(t (error "Buffer-source-file does not have pdf or djvu extension")))))
(defun doc-toc--open-handy-outliner ()
"Open handyoutliner to add TOC to document.
Prepare TOC using `doc-toc--tablist-to-handyoutliner'. Requires the
variabele `doc-toc-handyoutliner-path' to be set to the correct
path."
(interactive)
(start-process ""
nil
doc-toc-handyoutliner-path))
(defun doc-toc--open-filepath-in-file-browser ()
"Open the buffer file directory in the file browser.
When the variable `doc-toc-file-browser-command' is set, this
function is used by the `doc-toc--tablist-to-handyoutliner' function
so that the generated TOC can be easily added to the document
with the handyoutliner software."
(interactive)
(let ((process-connection-type nil))
(start-process ""
nil
doc-toc-file-browser-command
(url-file-directory (buffer-file-name)))))
;;; pdf parse tablist to
(defun doc-toc--tablist-to-handyoutliner ()
"Parse and prepare tablist-mode-buffer to source input.
Displays results in a newlycreated buffer for use as source input
to `pdfoutline' shell command."
(interactive)
(goto-char (point-min))
(let ((source-buffer (when (boundp 'doc-buffer) doc-buffer))
text)
(while (not (eobp))
(let* ((v (tabulated-list-get-entry))
(tabs (make-string (string-to-number (aref v 0)) ?\t)))
(setq text (concat text (format "%s%s %s\n" tabs (aref v 1) (aref v 2))))
(forward-line 1)))
(switch-to-buffer (find-file "contents.txt"))
(erase-buffer)
(doc-toc)
(when source-buffer
(setq-local doc-buffer source-buffer))
(insert text))
(save-buffer)
(if (not (and doc-toc-handyoutliner-path doc-toc-file-browser-command))
(message "Path to handyoutliner or file browser command not defined")
(doc-toc--open-handy-outliner)
(doc-toc--open-filepath-in-file-browser)))
;;;; add outline to document
(defun doc-toc--add-to-pdf ()
"Use buffer contents as source for adding TOC to PDF.
This command uses the shell program `pdfoutline'."
(interactive)
(save-buffer)
(call-process "pdfoutline" nil "*pdfoutline*" nil
(concat (file-name-sans-extension (buffer-name)) ".pdf")
(buffer-name)
(if doc-toc-replace-original-file
(concat (file-name-sans-extension (buffer-name)) ".pdf")
doc-toc-destination-file-name)))