From 7525add5f581272b746919085cfe2c5f5c6f28a9 Mon Sep 17 00:00:00 2001 From: Mykhailo Shevchuk Date: Sat, 12 Dec 2020 12:05:07 +0100 Subject: [PATCH 1/7] (feat,fix) save/export scrapper buffers - remap `save-buffer` to new function `orb-pdf-scrapper-save` - remap `write-file` to new function `orb-pdf-scrapper-save-as` These functions handle "fileless" Scrapper buffers correctly - fix a bug where the buffer would keep the master xml file name after cancelling - switch to Scrapper buffer when pressing 'n' in prevent-concurrent dialog --- orb-pdf-scrapper.el | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/orb-pdf-scrapper.el b/orb-pdf-scrapper.el index 60c5c08..2e05718 100644 --- a/orb-pdf-scrapper.el +++ b/orb-pdf-scrapper.el @@ -944,6 +944,8 @@ process." (defvar orb-pdf-scrapper-mode-map (let ((map (make-sparse-keymap))) (define-key map "\C-c\C-k" #'orb-pdf-scrapper-kill) + (define-key map [remap save-buffer] #'orb-pdf-scrapper-save) + (define-key map [remap write-file] #'orb-pdf-scrapper-save-as) map) "Keymap for `orb-pdf-scrapper-mode' minor mode. The keymap is updated automatically according to the Orb PDF @@ -1235,7 +1237,9 @@ Kill it and start a new one %s? " (orb--with-message! "Killing current process" (orb-pdf-scrapper--cleanup)) (orb-pdf-scrapper-run (orb-pdf-scrapper--get :new-key))) - ;; Do nothing + ;; go to the Scrapper buffer + (pop-to-buffer orb-pdf-scrapper--buffer) + ;; reset the concurring flag set by `orb-pdf-scrapper-run' (orb-pdf-scrapper--put :prevent-concurring nil)) ;; Finilize the requested context otherwise (cl-case callee @@ -1282,7 +1286,8 @@ Kill it and start a new one %s? 
" (orb-pdf-scrapper-dispatcher 'edit-bib 'continue)) ('edit-xml (when-let ((master-backup (orb-pdf-scrapper--get :master-backup))) - (rename-file master-backup orb-anystyle-parser-training-set t)) + (rename-file master-backup orb-anystyle-parser-training-set t) + (setq buffer-file-name nil)) (orb-pdf-scrapper-dispatcher (orb-pdf-scrapper--get :callee) 'continue)) (t (orb-pdf-scrapper-dispatcher 'error)))) @@ -1294,6 +1299,36 @@ Kill it and start a new one %s? " (kill-process process)) (orb-pdf-scrapper-dispatcher 'kill)) +(defun orb-pdf-scrapper-save () + "Save current ORB PDF Scrapper buffer in the respective temp file. +This command shadows `save-buffer' when `orb-pdf-scrapper-mode' is active." + (interactive) + (let ((temp-file + (cl-case (orb-pdf-scrapper--get :caller) + ('edit-txt (orb-pdf-scrapper--get :temp-txt)) + ('edit-bib (orb-pdf-scrapper--get :temp-bib)) + ('edit-org (orb-pdf-scrapper--get :temp-org)) + ('edit-xml (orb-pdf-scrapper--get :temp-xml)) + (t nil)))) ; fallback flag + (cond + ;; ORB PDF Scrapper buffers do not have file names + ((and (not buffer-file-name) temp-file) + (write-region (orb-buffer-string) nil temp-file nil -1) + (set-buffer-modified-p nil)) + ((save-buffer))))) + +(defun orb-pdf-scrapper-save-as () + "Export current ORB PDF Scrapper buffer to a file. +This command shadows `write-file' when `orb-pdf-scrapper-mode' is active." 
+ (interactive) + ;; ORB PDF Scrapper buffers do not have file names + (cond + ((not buffer-file-name) + (call-interactively #'write-file) + (set-visited-file-name nil) + (rename-buffer orb-pdf-scrapper--buffer)) + ((call-interactively #'write-file)))) + ;; ============================================================================ ;;; Entry point From b179e9ba195a22b1932ae654a4983bebcbb07b4f Mon Sep 17 00:00:00 2001 From: Mykhailo Shevchuk Date: Mon, 21 Dec 2020 22:16:40 +0100 Subject: [PATCH 2/7] (feat) orb-pdf-scrapper-export-options --- orb-pdf-scrapper.el | 218 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 196 insertions(+), 22 deletions(-) diff --git a/orb-pdf-scrapper.el b/orb-pdf-scrapper.el index 2e05718..40895ec 100644 --- a/orb-pdf-scrapper.el +++ b/orb-pdf-scrapper.el @@ -60,8 +60,7 @@ ;; ============================================================================ (defcustom orb-pdf-scrapper-grouped-export - '((parent "References") - (in-roam "In Org Roam database" list) + '((in-roam "In Org Roam database" list) (in-bib "In BibTeX file" list) (valid "Valid citation keys" table) (invalid "Invalid citation keys" table)) @@ -80,10 +79,7 @@ TYPE is ignored for the `parent' group and defaults to `list' for other groups when set to nil. Takes effect when `orb-pdf-scrapper-group-references' is t." - :type '(list (list :tag "Parent headline" - (const :format "" parent) - (string :tag "Title")) - (list :tag "\nIn-roam" + :type '(list (list :tag "\nIn-roam" (const :format "" in-roam) (string :tag "Title") (radio :tag "Type" :value list @@ -214,6 +210,24 @@ If the value of `orb-pdf-scrapper-list-style' is one of the :group 'orb-pdf-scrapper :type 'string) +(defcustom orb-pdf-scrapper-export-options + `((org (headline "References (extracted by ORB PDF Scrapper)" + :property-drawer (("PDF_SCRAPPER_TYPE" . 
"org") + ("PDF_SCRAPPER_SOURCE") + ("PDF_SCRAPPER_DATE")))) + (txt (headline "References (extracted by ORB PDF Scrapper)" + :property-drawer (("PDF_SCRAPPER_TYPE" . "text") + ("PDF_SCRAPPER_SOURCE") + ("PDF_SCRAPPER_DATE")))) + (bib (file ,(if (listp bibtex-completion-bibliography) + (car bibtex-completion-bibliography) + bibtex-completion-bibliography) + :placement prepend))) + "Description." + :group 'orb-pdf-scrapper + :risky t + :type '(repeat list)) + (defcustom orb-pdf-scrapper-set-fields '(("author" orb-pdf-scrapper--invalidate-nil-value) ("editor" orb-pdf-scrapper--invalidate-nil-value @@ -378,13 +392,13 @@ This is an auxiliary function for command Auxiliary function for `orb-pdf-scrapper-generate-keys'. REFS should be an alist of form ((CITEKEY . FORMATTED-ENTRY) . VALIDP). -References marked valid by `orb-pdf-scrapper-keygen-function' function +References validated by `orb-pdf-scrapper-keygen-function' function are further sorted into four groups: 'in-roam - available in the `org-roam' database; 'in-bib - available in `bibtex-completion-bibliography' file(s); 'valid - marked valid by the keygen function but are not -available in the user databases; + available in user database(s); 'invalid - marked invalid by the keygen function." (let* ((bibtex-completion-bibliography (orb-pdf-scrapper--get :global-bib)) ;; When using a quoted list here, sorted-refs is not erased in @@ -530,7 +544,6 @@ list otherwise." (org-insert-heading '(16) nil t) ;; insert heading (insert (format "%s\n" title)) - (org-demote) (org-end-of-subtree) ;; insert references (insert (format "\n#+name: %s\n" group)) @@ -742,15 +755,6 @@ Pressing the RED button, just in case") (orb-pdf-scrapper--refresh-mode 'org) (orb--with-message! 
"Generating Org data" (erase-buffer) - ;; insert parent heading - (org-insert-heading nil nil t) - (insert - (concat - (cadr (assoc 'parent orb-pdf-scrapper-grouped-export)) - " (retrieved by Orb PDF Scrapper from " - (f-filename (orb-pdf-scrapper--get :pdf-file)) ")")) - (org-end-of-subtree) - ;; insert child headings: in-roam, in-bib, valid, invalid (orb-pdf-scrapper--insert-refs) (write-region (orb-buffer-string) nil temp-org nil -1) (setq buffer-undo-list nil) @@ -908,18 +912,188 @@ set `orb-anystyle-parser-model' variable to the above path.")) (orb-pdf-scrapper--put :context 'error :training-process nil)))))))) + +;; ============================================================================ +;;; Helper functions: Export of extracted references +;; ============================================================================ + +(defun orb-pdf-scrapper--export-to-headline (type target properties temp-file) + "Description TYPE TARGET PROPERTIES TEMP-FILE." + (let ((drawer-props (plist-get properties :property-drawer)) + data) + ;; insert parent heading + (org-insert-heading nil nil t) + (insert target) + ;; insert properties + (dolist (prop drawer-props) + (let ((prop-name (car prop)) + (value (cdr prop)) + prop-value) + (cond + ;; call user function if provided + ((functionp value) + (setq prop-value (funcall value))) + ;; provide some values for select properties - if the name was + ;; specified but not a value; + ;; NOTE: rather a placeholder for future elaboration + ((null value) + (cond + ((string= "PDF_SCRAPPER_SOURCE" prop-name) + (setq prop-value + (f-filename (orb-pdf-scrapper--get :pdf-file)))) + ((string= "PDF_SCRAPPER_DATE" prop-name) + (setq prop-value (org-timestamp-format + (org-timestamp-from-time + (current-time) 'with-time) + "%Y-%m-%d %a %H:%M"))))) + ;; insert the user value + (t (setq prop-value value))) + ;; insert the property + (org-set-property prop-name prop-value))) + ;; prepare the data + ;; get the data from temp file and put them 
into the target buffer, + ;; which must be current when this function is called. + (with-temp-buffer + (insert-file-contents temp-file) + ;; do some type-specific stuff + (cl-case type + (org + (org-mode) ; not sure if this is really necessary + (goto-char (point-min)) + (while (re-search-forward org-heading-regexp nil t) + (org-demote))) + (bib + (goto-char (point-min)) + (insert "#+begin_src bibtex\n") + (goto-char (point-max)) + (insert "#+end_src"))) + (setq data (orb-buffer-string))) + ;; insert the data + (org-end-of-subtree) + (insert "\n\n" data))) + +(defun orb-pdf-scrapper--export-to-file (type target properties temp-file) + "TYPE TARGET PROPERTIES TEMP-FILE." + (let* ((current-dir (file-name-directory + (buffer-file-name + (orb-pdf-scrapper--get :original-buffer)))) + (current-key + (orb-pdf-scrapper--get :current-key)) + ;; this is a sort of cond, but execute all clauses sequentially + (target (--> target + ;; if target is non-nil and it is a relative filename, + ;; expand it within the original buffer's directory + (when it + (if (f-relative? it) (f-join current-dir it) it)) + ;; if target is nil assume current directory + (if (null it) current-dir it) + ;; if target is a directory, make a file with citekey as + ;; the file name and type as the extension the target + ;; otherwise return the target + (if (f-dir? 
it) + (f-join it (format "%s.%s" current-key type)) + it))) + (bibtex-completion-bibliography + (if (listp bibtex-completion-bibliography) + bibtex-completion-bibliography + (list bibtex-completion-bibliography))) + (placement (or (plist-get properties :placement) 'append)) + (buffer-visited-p (find-buffer-visiting target)) + ;; inline subroutine to insert only those entries from TEMP-FILE, that + ;; are not already in BUF + (insert-filtered-bib-entries + (lambda (temp-file) + (let (keys buf-data) + (save-excursion + (maphash (lambda (key _val) + (push key keys)) + (car (parsebib-parse-buffer)))) + (with-temp-buffer + (insert-file-contents temp-file) + (goto-char (point-min)) + (let ((bibtex-sort-ignore-string-entries t)) + (bibtex-map-entries + (lambda (key _beg _end) + (when (member key keys) + (bibtex-kill-entry))))) + (setq buf-data (orb-buffer-string))) + (insert buf-data)))) + buf) + (save-excursion + (find-file target) + (setq buf (current-buffer)) + (cl-case placement + (prepend + (goto-char (point-min)) + (cl-case type + (bib + (let ((bibtex-sort-ignore-string-entries t)) + (bibtex-beginning-of-first-entry)) + (funcall insert-filtered-bib-entries temp-file)) + (org + (orb-pdf-scrapper--export-to-headline + type (format "References extracted from %s" current-key) + nil temp-file) + (insert "\n")) + (txt + (insert-file-contents temp-file) + (insert "\n")))) + (append + (goto-char (point-max)) + (cl-case type + (bib + (let ((bibtex-sort-ignore-string-entries t)) + (bibtex-end-of-entry)) + (insert "\n") + (funcall insert-filtered-bib-entries temp-file)) + (org + (orb-pdf-scrapper--export-to-headline + type (format "References extracted from %s" current-key) + nil temp-file)) + (txt + (insert "\n") + (insert-file-contents temp-file))) + t) + ;; (overwrite + ;; t) + ) + (save-buffer buf)) + (unless buffer-visited-p + (kill-buffer buf)))) + +(defun orb-pdf-scrapper--export (type) + "Export the extracted and/or generated data. 
+TYPE is a symbol identifying type of data to be exported, one +of `txt', `bib' or `org'." + (let ((temp-file (orb-pdf-scrapper--get + (intern (format ":temp-%s" type))))) + ;; there may be several targets, export to all of them + (cl-loop + for (name target . properties) + in (cdr (assoc type orb-pdf-scrapper-export-options)) + do (cl-case name + (headline + ;; (orb-pdf-scrapper--export-to-headline type target + ;; properties temp-file) + t) + (file + (orb-pdf-scrapper--export-to-file type target + properties temp-file)))))) + (defun orb-pdf-scrapper--checkout () "Finalize Orb PDF Scrapper process. -Insert generated Org data into the note buffer that started the -process." +Insert the extracted and generated data according to the settings +of `orb-pdf-scrapper-org-export', `orb-pdf-scrapper-text-export', +and `orb-pdf-scarpper-bibtex-export'." (cl-case (orb-pdf-scrapper--get :context) ('start (pop-to-buffer (orb-pdf-scrapper--get :original-buffer)) (save-restriction (save-excursion (widen) - (goto-char (point-max)) - (insert-file-contents (orb-pdf-scrapper--get :temp-org)))) + (dolist (type (mapcar #'car orb-pdf-scrapper-export-options)) + (goto-char (point-max)) + (orb-pdf-scrapper--export type)))) (orb-pdf-scrapper-dispatcher 'kill)) (t (orb-pdf-scrapper-dispatcher 'error)))) From 4f7061f4d7346a3044bfb7a69a6b2babfe2d8e05 Mon Sep 17 00:00:00 2001 From: Mykhailo Shevchuk Date: Mon, 21 Dec 2020 22:30:47 +0100 Subject: [PATCH 3/7] (int) uncomment a region --- orb-pdf-scrapper.el | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/orb-pdf-scrapper.el b/orb-pdf-scrapper.el index 40895ec..b783eff 100644 --- a/orb-pdf-scrapper.el +++ b/orb-pdf-scrapper.el @@ -1069,12 +1069,12 @@ of `txt', `bib' or `org'." (intern (format ":temp-%s" type))))) ;; there may be several targets, export to all of them (cl-loop - for (name target . properties) + for (location target . 
properties) in (cdr (assoc type orb-pdf-scrapper-export-options)) - do (cl-case name + do (cl-case location (headline - ;; (orb-pdf-scrapper--export-to-headline type target - ;; properties temp-file) + (orb-pdf-scrapper--export-to-headline type target + properties temp-file) t) (file (orb-pdf-scrapper--export-to-file type target From b18bdf9772ca521c85ee626e65f8913616efd9a0 Mon Sep 17 00:00:00 2001 From: Mykhailo Shevchuk Date: Sun, 10 Jan 2021 23:04:10 +0100 Subject: [PATCH 4/7] orb-pdf-scrapper export working beta --- orb-pdf-scrapper.el | 521 ++++++++++++++++++++++++++++++-------------- 1 file changed, 363 insertions(+), 158 deletions(-) diff --git a/orb-pdf-scrapper.el b/orb-pdf-scrapper.el index b783eff..fed3337 100644 --- a/orb-pdf-scrapper.el +++ b/orb-pdf-scrapper.el @@ -211,19 +211,148 @@ If the value of `orb-pdf-scrapper-list-style' is one of the :type 'string) (defcustom orb-pdf-scrapper-export-options - `((org (headline "References (extracted by ORB PDF Scrapper)" - :property-drawer (("PDF_SCRAPPER_TYPE" . "org") + '((org (headline "References (extracted by ORB PDF Scrapper)" + :property-drawer (("PDF_SCRAPPER_TYPE") ("PDF_SCRAPPER_SOURCE") - ("PDF_SCRAPPER_DATE")))) - (txt (headline "References (extracted by ORB PDF Scrapper)" - :property-drawer (("PDF_SCRAPPER_TYPE" . "text") - ("PDF_SCRAPPER_SOURCE") - ("PDF_SCRAPPER_DATE")))) - (bib (file ,(if (listp bibtex-completion-bibliography) - (car bibtex-completion-bibliography) - bibtex-completion-bibliography) - :placement prepend))) - "Description." + ("PDF_SCRAPPER_DATE"))))) + "Options for automatic export of references extracted by ORB PDF Scrapper. +This variable is an association list of the form +\(TYPE . ((TARGET LOCATION PROPERTIES))). + +TYPE is the type of the exported data, one of the symbols `txt', +`bib' or `org'. The data will only be exported if its +corresponding symbol is present on the list. + +TARGET must be one of the symbols `heading' or `path'. 
The +symbol `heading' means export the data under a heading in the +buffer of origin, the Org-mode buffer where the ORB PDF Scrapper +process was started. The symbol `path' means export the data to +another file. It is possible to specify both export targets +simultaneously for a given export TYPE or multiple targets of the +same type. + +Example: +\(setq orb-pdf-scrapper-export-options + '((org (heading HEADLINE PROPERTIES) + (path LOCATION PROPERTIES)))) + +LOCATION (HEADLINE) is a string specifying location of the TARGET. + +- If TARGET is `heading', the supplied string will be used as +headline text. The data will be exported slightly differently +depending on TYPE. Text references will be exported as is. +BibTeX references will be put into an Org-mode source code block. +Org-mode references, if grouped under different headings, will be +exported with the headings demoted by one level. + +- If TARGET is `path', the supplied string will be used as a +filesystem target. The path can be absolute or relative, in the +latter case it will be relative to the directory of the buffer of +origin. If the path (absolute or relative) is an *existing* +directory, the full path to the target file will be constructed +from the supplied string as a directory name, the #+ROAM_KEY: +property in the buffer of origin as a file name and TYPE as the +file's extension. If the path is not an existing directory, it +will be treated as a file name, and the data will be exported +there. The file will be created if it does not exist: + +> \"~/org/references.org\" - absolute path +> \"this-note's-bib-references.bib\" - path relative to the buffer of origin +> \"~/orb-pdf-scrapper-references/\" - absolute directory path. + +In the latter case, if the directory exists, the extracted data +will be put into a file \"Doe2020.bib\", assuming the #+ROAM_KEY: +property is \"Doe2020\" and TYPE is `bib'. 
If the directory does +not exist, the extracted data will be put into a (newly created) +file \"~/orb-pdf-scrapper-references/\". + +Example: +\(setq orb-pdf-scrapper-export-options + '((org (heading \"Org references\" PROPERTIES)) + (txt (path \"text-references/\" PROPERTIES)))) + +PROPERTIES is a property list providing additional export +specifications. Some properties are specific to only certain +export TYPEs or TARGETs. + +`:placement' property allows to specify placement of the exported +data. It can be a symbol `prepend' or `append'. + +When TARGET is `path', text data is simply put at the beginning +or end of the target file accordingly to the value of the +`:placement' property. Org-mode data is placed before the first +or after the last heading, respectively. Similarly, BibTeX data +is placed or before the first or after the last entry, comments +and @String entries ignored. + +When TARGET is `heading', this property specifies whether the +parent heading should be put before or after other headings. + +When TARGET is `path' and LOCATION is an Org-mode file, the value +of the `:placement' property can also be a list of the form +\(heading HEADLINE PROPERTIES). In this case the data will be +put in the target file under a heading with HEADLINE as the +headline text. PROPERTIES are additional export options as +described here and below. The `heading' value of the +`:placement' property cannot be used recursively in this case. + +Example: +\(setq orb-pdf-scrapper-export-options + '((org (path \"references.org\" :placement append)) + (bib (path \"references.bib\" :placement prepend)) + (txt (path \"references.txt\" + :placement (headline \"References\" :placement append))))) + +If placement is not specified, the data is appended by default. + +`:property-drawer' property allows to supply a heading with some +properties. The value of this property is list with +elements (PROPERTY_NAME . 
PROPERTY_VALUE) or PROPERTY_NAME, the +latter form being treated as (PROPERTY_NAME . nil). + +PROPERTY_NAME must be a string, it will be used as a property +name. PROPERY_VALUE can be a string, in which case it will be +used as the value of the property. It can also be a function +name as an unquoted symbol, in which case this function will be +called to get the value of the property. The return value must +be a string. + +The following properties are recognized internally and will be +supplied with automatically generated values if PROPERTY_VALUE is +nil: + +> PDF_SCRAPPER_TYPE - TYPE of the export data +> PDF_SCRAPPER_SOURCE - name of the PDF file the data were extracted from +> PDF_SCRAPPER_DATE - time and date the data were exported + +Example: +\(setq orb-pdf-scrapper-export-options + '((org (heading \"Org references\" PROPERTIES))) + (txt (heading \"Text references\" + :property-drawer + '((\"PDF_SCRAPPER_TYPE\" . \"text\") + \"PDF_SCRAPPER_DATE\" + \"PDF_SCRAPPER_SOURCE\" + (\"PROPERTY_1\" . \"VALUE_1\") + (\"PROPERTY_2\" . my-function)))))) + +`:filter-bib-entries' property controls filtering of exported +BibTeX entries. If the value of this property is non-nil and +TARGET is a BibTeX file, only the entries that are not already +present in this file will be exported. The value can also be a +string or a list of strings specifying BibTeX file(s), or a +variable as an unquoted symbol holding a string or a list of +strings specifying BibTeX file(s), in which cases the entries +will be filtered also against this/these file(s) in *addition* to +the TARGET file. In such instances, filtering will also be +applied to entries exported to an Org-mode heading. + +Example: +\(setq orb-pdf-scrapper-export-options + '((bib (path \"references.bib\" :filter-bib-entries t + (heading \"BibTeX references\" + :filter-bib-entries bibtex-completion-bibliography))))." 
+ :group 'orb-pdf-scrapper :risky t :type '(repeat list)) @@ -385,7 +514,7 @@ This is an auxiliary function for command ;; TODO: for testing until implemented (when natural-order (cl-pushnew `("natural-order" . ,natural-order) entry)) - (cons new-key (cons entry validp)))) + (cons new-key (cons entry validp)))) (defun orb-pdf-scrapper--sort-refs (refs) "Sort references REFS. @@ -917,168 +1046,245 @@ set `orb-anystyle-parser-model' variable to the above path.")) ;;; Helper functions: Export of extracted references ;; ============================================================================ -(defun orb-pdf-scrapper--export-to-headline (type target properties temp-file) - "Description TYPE TARGET PROPERTIES TEMP-FILE." +(defun orb-pdf-scrapper--export-get-point (type placement) + "In current buffer, go to the point where data should be placed. +TYPE is target type, one of the symbols `txt', `bib' or `org'. +PLACEMENT is placement type, one of the symbols `append' or `prepend'. + +Return the point." + ;; for Org export go to the first or last heading, for BibTeX export + ;; go to the first or last entry rather than the beginning or end of + ;; buffer, respectively. + (cl-case placement + ('prepend + (cl-case type + (bib + (let ((bibtex-sort-ignore-string-entries t)) + (bibtex-beginning-of-first-entry))) + (org + (goto-char (point-min)) + (when (org-before-first-heading-p) + (org-get-next-sibling))) + (t + (goto-char (point-min))))) + (t + (cl-case type + (bib + (let ((bibtex-sort-ignore-string-entries t)) + (bibtex-end-of-entry))) + (org + (goto-char (point-max)) + (org-end-of-subtree) + (forward-line)) + (t + (goto-char (point-max)))))) + (point)) + +(defun orb-pdf-scrapper--export-insert-temp-data (type properties) + "Insert data from temporary file at point. +TYPE is type of data. PROPERTIES are additional export properties." 
+ (let* ((temp-file (orb-pdf-scrapper--get + (intern (format ":temp-%s" type)))) + (filter (plist-get properties :filter-bib-entries)) + ;; inline subroutine to filter BibTeX entries + (insert-filtered-bib-entries + (lambda (temp-file filter) + (when (symbolp filter) + (setq filter (symbol-value filter))) + (let ((sources (cond + ((stringp filter) (list filter buffer-file-name)) + ((listp filter) (append filter buffer-file-name)) + (t (list buffer-file-name)))) + keys buf-data) + (save-excursion + (dolist (source sources) + (when source + (let ((buffer-visisted-p (find-buffer-visiting source))) + (find-file source) + (when (eq major-mode 'bibtex-mode) + (maphash (lambda (key _val) + (push key keys)) + (car (parsebib-parse-buffer)))) + (unless buffer-visisted-p + (kill-buffer (current-buffer))))))) + (with-temp-buffer + (insert-file-contents temp-file) + (goto-char (point-min)) + (let ((bibtex-sort-ignore-string-entries t)) + (bibtex-map-entries + (lambda (key _beg _end) + (when (member key keys) + (bibtex-kill-entry))))) + (setq buf-data (orb-buffer-string))) + (insert buf-data))))) + (cl-case type + (bib + (if filter + (funcall insert-filtered-bib-entries temp-file filter) + (insert-file-contents temp-file))) + (t + (insert-file-contents temp-file))))) + +(defun orb-pdf-scrapper--export-to-heading (type name properties) + "Description TYPE NAME PROPERTIES." 
(let ((drawer-props (plist-get properties :property-drawer)) - data) - ;; insert parent heading - (org-insert-heading nil nil t) - (insert target) - ;; insert properties - (dolist (prop drawer-props) - (let ((prop-name (car prop)) - (value (cdr prop)) - prop-value) - (cond - ;; call user function if provided - ((functionp value) - (setq prop-value (funcall value))) - ;; provide some values for select properties - if the name was - ;; specified but not a value; - ;; NOTE: rather a placeholder for future elaboration - ((null value) - (cond - ((string= "PDF_SCRAPPER_SOURCE" prop-name) - (setq prop-value - (f-filename (orb-pdf-scrapper--get :pdf-file)))) - ((string= "PDF_SCRAPPER_DATE" prop-name) - (setq prop-value (org-timestamp-format - (org-timestamp-from-time - (current-time) 'with-time) - "%Y-%m-%d %a %H:%M"))))) - ;; insert the user value - (t (setq prop-value value))) - ;; insert the property - (org-set-property prop-name prop-value))) - ;; prepare the data - ;; get the data from temp file and put them into the target buffer, - ;; which must be current when this function is called. + (placement (plist-get properties :placement)) + (end (make-marker)) + beg data) + ;; Make the heading in a temporary buffer (with-temp-buffer - (insert-file-contents temp-file) + ;; get the desired position + ;; insert parent heading + (org-insert-heading nil nil t) + (insert name) + ;; insert properties + (dolist (prop drawer-props) + (let ((prop-name (or (car-safe prop) prop)) + (value (cdr-safe prop)) + prop-value) + (cond + ;; call user function if provided + ((functionp value) + (setq prop-value (funcall value)) + (unless (stringp prop-value) + (user-error "Function %s must return a string. 
\ +Check `orb-pdf-scrapper-export-options'" value))) + ;; provide some values for select properties - if the name was + ;; specified but not a value; + ;; NOTE: rather a placeholder for future elaboration + ((null value) + (cond + ((string= "PDF_SCRAPPER_TYPE" prop-name) + (setq prop-value (format "%s" type))) + ((string= "PDF_SCRAPPER_SOURCE" prop-name) + (setq prop-value + (f-filename (orb-pdf-scrapper--get :pdf-file)))) + ((string= "PDF_SCRAPPER_DATE" prop-name) + (setq prop-value (org-timestamp-format + (org-timestamp-from-time + (current-time) 'with-time) + "%Y-%m-%d %a %H:%M"))))) + ;; insert the user value + (t (setq prop-value value))) + ;; insert the property + (org-set-property prop-name prop-value))) + (org-end-of-meta-data) + (insert "\n") + (setq beg (point)) + (set-marker end beg) + (set-marker-insertion-type end t) + (orb-pdf-scrapper--export-insert-temp-data type properties) ;; do some type-specific stuff + ;; + ;; Org: demote group headings which are to become subheadings of the + ;; newly created heading. + ;; + ;; BibTeX: insert into a language source block (cl-case type (org - (org-mode) ; not sure if this is really necessary - (goto-char (point-min)) + (org-mode) + (goto-char beg) (while (re-search-forward org-heading-regexp nil t) (org-demote))) (bib - (goto-char (point-min)) + (goto-char beg) (insert "#+begin_src bibtex\n") - (goto-char (point-max)) - (insert "#+end_src"))) - (setq data (orb-buffer-string))) + (goto-char end) + (insert "#+end_src\n"))) + (setq data (orb-buffer-string)) + (set-marker end nil)) ;; insert the data - (org-end-of-subtree) - (insert "\n\n" data))) - -(defun orb-pdf-scrapper--export-to-file (type target properties temp-file) - "TYPE TARGET PROPERTIES TEMP-FILE." + (orb-pdf-scrapper--export-get-point 'org placement) + (insert data "\n"))) + +(defun orb-pdf-scrapper--export-to-file (type location properties) + "Export data generated by ORB PDF Scrapper to a file. 
+TYPE is a symbol identifying type of data to be exported, one of +`org', `txt', or `bib'. + +LOCATION is a string specifying the location of the target file. +It can be a relative or an absolute file path. If the file does +not exist, it will be created. It can also be a relative or an +absolute path to an existing directory. In this case the data +will be exported to a file in that directory with the citation +key associated with the buffer of origin (extracted from its +#+ROAM_KEY: property) as the filename and TYPE as the extension. + +PROPERTIES is a property list with additional export properties. +See `orb-pdf-scrapper-export-options' for details." (let* ((current-dir (file-name-directory (buffer-file-name (orb-pdf-scrapper--get :original-buffer)))) (current-key (orb-pdf-scrapper--get :current-key)) ;; this is a sort of cond, but execute all clauses sequentially - (target (--> target - ;; if target is non-nil and it is a relative filename, - ;; expand it within the original buffer's directory - (when it - (if (f-relative? it) (f-join current-dir it) it)) - ;; if target is nil assume current directory - (if (null it) current-dir it) - ;; if target is a directory, make a file with citekey as - ;; the file name and type as the extension the target - ;; otherwise return the target - (if (f-dir? 
it) - (f-join it (format "%s.%s" current-key type)) - it))) - (bibtex-completion-bibliography - (if (listp bibtex-completion-bibliography) - bibtex-completion-bibliography - (list bibtex-completion-bibliography))) - (placement (or (plist-get properties :placement) 'append)) - (buffer-visited-p (find-buffer-visiting target)) - ;; inline subroutine to insert only those entries from TEMP-FILE, that - ;; are not already in BUF - (insert-filtered-bib-entries - (lambda (temp-file) - (let (keys buf-data) - (save-excursion - (maphash (lambda (key _val) - (push key keys)) - (car (parsebib-parse-buffer)))) - (with-temp-buffer - (insert-file-contents temp-file) - (goto-char (point-min)) - (let ((bibtex-sort-ignore-string-entries t)) - (bibtex-map-entries - (lambda (key _beg _end) - (when (member key keys) - (bibtex-kill-entry))))) - (setq buf-data (orb-buffer-string))) - (insert buf-data)))) - buf) - (save-excursion - (find-file target) + (path (--> location + ;; if location is non-nil and it is a relative filename, + ;; expand it within the original buffer's directory + (when it + (if (f-relative? it) (f-join current-dir it) it)) + ;; if location is nil assume current directory + (if (null it) current-dir it) + ;; if location is a directory, make a file with citekey as + ;; the file name and type as the extension the location + ;; otherwise return the location + (if (f-dir? 
it) + (f-join it (format "%s.%s" current-key type)) + it))) + ;; file extension if any + (ext (f-ext path)) + (buffer-visited-p (find-buffer-visiting path)) + target-type buf) + (find-file path) (setq buf (current-buffer)) - (cl-case placement - (prepend - (goto-char (point-min)) - (cl-case type - (bib - (let ((bibtex-sort-ignore-string-entries t)) - (bibtex-beginning-of-first-entry)) - (funcall insert-filtered-bib-entries temp-file)) - (org - (orb-pdf-scrapper--export-to-headline - type (format "References extracted from %s" current-key) - nil temp-file) - (insert "\n")) - (txt - (insert-file-contents temp-file) - (insert "\n")))) - (append - (goto-char (point-max)) - (cl-case type - (bib - (let ((bibtex-sort-ignore-string-entries t)) - (bibtex-end-of-entry)) - (insert "\n") - (funcall insert-filtered-bib-entries temp-file)) - (org - (orb-pdf-scrapper--export-to-headline - type (format "References extracted from %s" current-key) - nil temp-file)) - (txt - (insert "\n") - (insert-file-contents temp-file))) - t) - ;; (overwrite - ;; t) - ) - (save-buffer buf)) + ;; type of the target file; try to determine it from the major mode; + ;; assume TYPE otherwise. + (setq target-type + (pcase major-mode + ('org-mode 'org) + ('bibtex-mode 'bib) + ((or 'text-mode 'fundamental-mode) 'txt) + (_ type))) + (save-mark-and-excursion + (pcase (plist-get properties :placement) + ('prepend + (orb-pdf-scrapper--export-get-point target-type 'prepend) + (orb-pdf-scrapper--export-insert-temp-data type properties) + (when (memq type '(org txt)) + (insert "\n"))) + (`(heading ,headline . 
,heading-properties) + (if (string= ext "org") + ;; NOTE: heading-properties take precedence over path + ;; properties + (orb-pdf-scrapper--export-to-heading + type headline (append heading-properties properties)) + (user-error "Heading placement only possible in ORG files"))) + ;; defaults to append + (_ + (orb-pdf-scrapper--export-get-point target-type 'append) + (when (memq type '(bib txt)) + (insert "\n")) + (orb-pdf-scrapper--export-insert-temp-data type properties)))) + (save-buffer buf) (unless buffer-visited-p (kill-buffer buf)))) (defun orb-pdf-scrapper--export (type) "Export the extracted and/or generated data. TYPE is a symbol identifying type of data to be exported, one -of `txt', `bib' or `org'." - (let ((temp-file (orb-pdf-scrapper--get - (intern (format ":temp-%s" type))))) - ;; there may be several targets, export to all of them - (cl-loop - for (location target . properties) - in (cdr (assoc type orb-pdf-scrapper-export-options)) - do (cl-case location - (headline - (orb-pdf-scrapper--export-to-headline type target - properties temp-file) - t) - (file - (orb-pdf-scrapper--export-to-file type target - properties temp-file)))))) +of `txt', `bib' or `org'. + +The user variable `orb-pdf-scrapper-export-options' controls +export options." + ;; there may be several targets for a given TYPE, export to all of them + (cl-loop + for (target location . properties) + in (cdr (assoc type orb-pdf-scrapper-export-options)) + do (cl-case target + (heading + (orb-pdf-scrapper--export-to-heading type location properties)) + (path + (orb-pdf-scrapper--export-to-file type location properties))))) (defun orb-pdf-scrapper--checkout () "Finalize Orb PDF Scrapper process. @@ -1088,12 +1294,11 @@ and `orb-pdf-scarpper-bibtex-export'." 
(cl-case (orb-pdf-scrapper--get :context) ('start (pop-to-buffer (orb-pdf-scrapper--get :original-buffer)) - (save-restriction - (save-excursion - (widen) - (dolist (type (mapcar #'car orb-pdf-scrapper-export-options)) - (goto-char (point-max)) - (orb-pdf-scrapper--export type)))) + ;; export the extracted/generated data + (dolist (type (mapcar #'car orb-pdf-scrapper-export-options)) + (orb-pdf-scrapper--export type)) + ;; NOTE: "break point" for ease of debugging + ;; (user-error "Halt") (orb-pdf-scrapper-dispatcher 'kill)) (t (orb-pdf-scrapper-dispatcher 'error)))) From e099ef01232130b5a59421635bab007fc39fe916 Mon Sep 17 00:00:00 2001 From: Mykhailo Shevchuk Date: Mon, 11 Jan 2021 00:13:39 +0100 Subject: [PATCH 5/7] (fix) consistent formatting between headings fix #151 --- orb-pdf-scrapper.el | 48 ++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/orb-pdf-scrapper.el b/orb-pdf-scrapper.el index fed3337..bff5acc 100644 --- a/orb-pdf-scrapper.el +++ b/orb-pdf-scrapper.el @@ -662,25 +662,31 @@ list otherwise." 
(cond (orb-pdf-scrapper-group-references (dolist (ref-group - (orb-pdf-scrapper--sort-refs orb-pdf-scrapper--refs)) - (when-let* ((group (car ref-group)) - (refs (cdr ref-group)) - (heading - (cdr (assoc group - orb-pdf-scrapper-grouped-export))) - (title (car heading)) - (type (cadr heading))) - (org-insert-heading '(16) nil t) - ;; insert heading - (insert (format "%s\n" title)) - (org-end-of-subtree) - ;; insert references - (insert (format "\n#+name: %s\n" group)) - (cl-case type - ('table - (orb-pdf-scrapper--insert-org-as-table refs)) - (t - (orb-pdf-scrapper--insert-org-as-list refs)))))) + (orb-pdf-scrapper--sort-refs orb-pdf-scrapper--refs)) + (when-let* ((group (car ref-group)) + (refs (cdr ref-group)) + (heading + (cdr (assoc group + orb-pdf-scrapper-grouped-export))) + (title (car heading)) + (type (cadr heading)) + (pos (make-marker))) + (unless (bobp) + (org-N-empty-lines-before-current 1)) + (org-insert-heading '(16) nil t) + ;; insert heading + (insert (format "%s\n" title)) + (org-N-empty-lines-before-current 1) + ;; insert references + (insert (format "#+name: %s\n" group)) + (set-marker pos (point)) + (set-marker-insertion-type pos t) + (cl-case type + ('table + (orb-pdf-scrapper--insert-org-as-table refs)) + (t + (orb-pdf-scrapper--insert-org-as-list refs))) + (goto-char pos)))) (t (insert "\n") (let ((refs (nreverse orb-pdf-scrapper--refs))) @@ -688,7 +694,9 @@ list otherwise." 
('table (orb-pdf-scrapper--insert-org-as-table refs)) (t - (orb-pdf-scrapper--insert-org-as-list refs))))))) + (orb-pdf-scrapper--insert-org-as-list refs)))))) + (goto-char (point-max)) + (org-N-empty-lines-before-current 0)) ;; ============================================================================ From bc5d069cf8f42dec5f78704f08765db08940bd3b Mon Sep 17 00:00:00 2001 From: Mykhailo Shevchuk Date: Sun, 24 Jan 2021 10:46:17 +0100 Subject: [PATCH 6/7] (fix) set bibtex dialect before mapping entries the global value of `bibtex-dialect` may be uninitialized leading to an error --- orb-pdf-scrapper.el | 1 + 1 file changed, 1 insertion(+) diff --git a/orb-pdf-scrapper.el b/orb-pdf-scrapper.el index bff5acc..45b5288 100644 --- a/orb-pdf-scrapper.el +++ b/orb-pdf-scrapper.el @@ -1119,6 +1119,7 @@ TYPE is type of data. PROPERTIES are additional export properties." (insert-file-contents temp-file) (goto-char (point-min)) (let ((bibtex-sort-ignore-string-entries t)) + (bibtex-set-dialect 'biblatex t) (bibtex-map-entries (lambda (key _beg _end) (when (member key keys) From 84fb4bd98db3e2094ddba23a269a54452abfc0e9 Mon Sep 17 00:00:00 2001 From: Mykhailo Shevchuk Date: Tue, 2 Mar 2021 08:59:40 +0100 Subject: [PATCH 7/7] (doc) update README ORB PDF Scrapper section --- README.md | 56 +++++++++++++++++++++++++++++++++++++++++++-- orb-pdf-scrapper.el | 3 +-- 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5b50da4..dee1f2a 100644 --- a/README.md +++ b/README.md @@ -564,6 +564,9 @@ in putting each reference onto a separate line. After you are finished with editing the text data, press `C-c C-c` to proceed to the second step. +Press `C-x C-s` to save your progress or `C-x C-w` to write the text references +into a file. + Press `C-c C-k` anytime to abort the ORB PDF Scrapper process. #### BibTeX mode @@ -587,10 +590,16 @@ generation. 
During key generation, it is also possible to automatically set the values of BibTeX fields: see `orb-pdf-scrapper-set-fields` docstring for more details. +Press `C-x C-s` to save your progress or `C-x C-w` to write the BibTeX entries +into a file. + Press `C-c C-r` to return to the text-editing mode in its last state. Note that all the progress in BibTeX mode will be lost. -Press `C-c C-c` to proceed to the third step. +Press `C-c C-c` to proceed to the third step. If the BibTeX buffer was edited +and the changes were not saved, e.g. by pressing `C-x C-s`, you will be +prompted to generate BibTeX keys by default. The variable +`orb-pdf-prompt-to-generate-keys` more finely controls this behaviour. #### Org mode In the third step, the BibTeX records are processed internally by ORB PDF @@ -621,17 +630,60 @@ Review and edit the generated Org-mode data, or press `C-c C-c` to insert the references into the note's buffer and finish the ORB PDF Scrapper. +Press `C-x C-s` to save your progress or `C-x C-w` to write the Org data +into a file. + Press `C-c C-r` to return to BibTeX editing mode in its last state. Note that all the progress in current mode will be lost. The following user variables control the appearance of the generated Org-mode data: `orb-pdf-scrapper-group-references`, `orb-pdf-scrapper-grouped-export`, `orb-pdf-scrapper-ungrouped-export`, `orb-pdf-scrapper-table-export-fields`, -`orb-pdf-scrapper-list-style`, `orb-pdf-scrapper-citation-numbers`, +`orb-pdf-scrapper-list-style`, `orb-pdf-scrapper-reference-numbers`, `orb-pdf-scrapper-citekey-format`. These variables can be set through the Customize interface or with `setq`. Refer to their respective docstrings in Emacs for more information. +#### Exporting data generated by ORB PDF Scrapper +The different types of data generated by ORB PDF Scrapper – text, BibTeX and +Org – can be exported to the buffer of origin or an external file. By default, +only the Org data is exported to the buffer of origin. 
Different export options +can be set in `orb-pdf-scrapper-export-options`. Consult its docstring for a +detailed explanation. The following example demonstrates various +possibilities. + +``` el +(setq orb-pdf-scrapper-export-options + '((org ;; <= TYPE + ;; Export to a heading in the buffer of origin + (heading "References (extracted by ORB PDF Scrapper)" + ;; ^ ^ + ;; TARGET LOCATION + ;; PROPERTIES + ;; v + :property-drawer ("PDF_SCRAPPER_TYPE" + "PDF_SCRAPPER_SOURCE" + "PDF_SCRAPPER_DATE"))) + (txt + ;; Export to a file "references.org" + (path "references.org" + ;; under a heading "New references" + :placement + (heading "New references" + :property-drawer ("PDF_SCRAPPER_TYPE" + "PDF_SCRAPPER_SOURCE" + "PDF_SCRAPPER_DATE") + ;; Put the new heading in front of other headings + :placement prepend))) + (bib + ;; Export to a file in an existing directory. The file name will be CITEKEY.bib + (path "/path/to/references-dir/" + :placement prepend + ;; Include only the references that are not in the target file + ;; *and* the file(s) specified in bibtex-completion-bibliography + :filter-bib-entries bibtex-completion-bibliography)))) +``` + #### Training a Parser model ##### Prerequisites Currently, the core data set (explained below) must be installed manually by the user as follows: diff --git a/orb-pdf-scrapper.el b/orb-pdf-scrapper.el index 45b5288..d466477 100644 --- a/orb-pdf-scrapper.el +++ b/orb-pdf-scrapper.el @@ -1298,8 +1298,7 @@ export options." (defun orb-pdf-scrapper--checkout () "Finalize Orb PDF Scrapper process. Insert the extracted and generated data according to the settings -of `orb-pdf-scrapper-org-export', `orb-pdf-scrapper-text-export', -and `orb-pdf-scarpper-bibtex-export'." +of `orb-pdf-scrapper-export-options'." (cl-case (orb-pdf-scrapper--get :context) ('start (pop-to-buffer (orb-pdf-scrapper--get :original-buffer))