-
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement more pdf importers (#7947)
* GrobidPdfMetadataImporter implemented Implemented an Importer that queries Grobid for metadata of a pdf. The necessary Grobid functionality (retrieving BibTeX for a pdf) is not yet available in Grobid, but we opened a PR that implements it (kermitt2/grobid#800). * Fixed class when accessing resources * Use FileHelper method to get extension * Use jsoup to issue POST request * Removed unnecessary field * Reverted URLDownload It's no longer necessary to set the POST data by bytes as we use JSoup for that. * Changelog entry * Add pdf link to imported entry * Remove citationkey from Grobid Grobid cannot predict a citationkey * FirstPageImporter * Fixed grammar mistake in CHANGELOG.md Co-authored-by: Christoph <siedlerkiller@gmail.com> * Fixed Grobid tests * Fixed Grobid URL * Checkstyle * Fixed doc * Checkstyle * Use JSoup for plaintext citations as well * Renamed FirstPageImporter to PdfVerbatimBibTextImporter * Fixed getName (no importer) * Renamed Grobid importer to match convention * PdfEmbeddedBibTeXImporter * Renamed PdfEmbeddedBibTeXImporter to PdfEmbeddedBibFileImporter * Checkstyle * Remove debug output * Checkstyle * PdfMergeMetadataImporter * Add DOI and ISBN fetching in PdfMergeMetadataImporter * Fixed concurrent list access * Adapted tests to contain fetchable IDs * Derive XMP preferences from importFormatPreferences * Localization * Use Importers in JabRef * Remove unnecessary test documents * Checkstyle * Grobid Timeout * Null-check * Use MergeImporter as WebFetcher Users can perform a PDF import on already imported PDFs to improve the quality of the entry * Only force BibTeX import if everything else fails Fixes #7984 * Prioritize non-bruteforce importers When importing, first try importers that can tell if they are suitable for a certain file format or not. Some importers only check if a file is present, not if it is in the correct format (isRecognizedFormat is always true if an existing file is given). They are used last. 
The List of importers now reflects that prioritization. It is not sorted by importer names anymore. The getter-methods getImportFormats and getImportFormatList still sort the List by name for the View. * Checkstyle * Fixed WebFetchersTest * Grobid does not need localization * Followup on removed Grobid localization * Fixed tests * Checkstyle * Grobid Fetcher and Tests adapted to updated Grobid * Adapted GrobidServiceTest to updated Grobid Co-authored-by: Christoph <siedlerkiller@gmail.com>
- Loading branch information
1 parent
a80435f
commit 0b02dd4
Showing
21 changed files
with
993 additions
and
47 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
166 changes: 166 additions & 0 deletions
166
src/main/java/org/jabref/logic/importer/fileformat/PdfEmbeddedBibFileImporter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
package org.jabref.logic.importer.fileformat; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.IOException; | ||
import java.nio.charset.Charset; | ||
import java.nio.file.Path; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Objects; | ||
|
||
import org.jabref.logic.importer.ImportFormatPreferences; | ||
import org.jabref.logic.importer.Importer; | ||
import org.jabref.logic.importer.ParseException; | ||
import org.jabref.logic.importer.ParserResult; | ||
import org.jabref.logic.l10n.Localization; | ||
import org.jabref.logic.util.StandardFileType; | ||
import org.jabref.logic.util.io.FileUtil; | ||
import org.jabref.logic.xmp.EncryptedPdfsNotSupportedException; | ||
import org.jabref.logic.xmp.XmpUtilReader; | ||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.util.DummyFileUpdateMonitor; | ||
|
||
import org.apache.pdfbox.pdmodel.PDDocument; | ||
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary; | ||
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode; | ||
import org.apache.pdfbox.pdmodel.PDPage; | ||
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode; | ||
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification; | ||
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile; | ||
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; | ||
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment; | ||
|
||
/** | ||
* PdfEmbeddedBibFileImporter imports an embedded Bib-File from the PDF. | ||
*/ | ||
public class PdfEmbeddedBibFileImporter extends Importer { | ||
|
||
private final ImportFormatPreferences importFormatPreferences; | ||
private final BibtexParser bibtexParser; | ||
|
||
public PdfEmbeddedBibFileImporter(ImportFormatPreferences importFormatPreferences) { | ||
this.importFormatPreferences = importFormatPreferences; | ||
bibtexParser = new BibtexParser(importFormatPreferences, new DummyFileUpdateMonitor()); | ||
} | ||
|
||
@Override | ||
public boolean isRecognizedFormat(BufferedReader input) throws IOException { | ||
return input.readLine().startsWith("%PDF"); | ||
} | ||
|
||
@Override | ||
public ParserResult importDatabase(BufferedReader reader) throws IOException { | ||
Objects.requireNonNull(reader); | ||
throw new UnsupportedOperationException("PdfEmbeddedBibFileImporter does not support importDatabase(BufferedReader reader)." | ||
+ "Instead use importDatabase(Path filePath, Charset defaultEncoding)."); | ||
} | ||
|
||
@Override | ||
public ParserResult importDatabase(String data) throws IOException { | ||
Objects.requireNonNull(data); | ||
throw new UnsupportedOperationException("PdfEmbeddedBibFileImporter does not support importDatabase(String data)." | ||
+ "Instead use importDatabase(Path filePath, Charset defaultEncoding)."); | ||
} | ||
|
||
@Override | ||
public ParserResult importDatabase(Path filePath, Charset defaultEncoding) { | ||
try (PDDocument document = XmpUtilReader.loadWithAutomaticDecryption(filePath)) { | ||
return new ParserResult(getEmbeddedBibFileEntries(document)); | ||
} catch (EncryptedPdfsNotSupportedException e) { | ||
return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported.")); | ||
} catch (IOException | ParseException e) { | ||
return ParserResult.fromError(e); | ||
} | ||
} | ||
|
||
/** | ||
* Extraction of embedded files in pdfs adapted from: | ||
* Adapted from https://svn.apache.org/repos/asf/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/ExtractEmbeddedFiles.javaj | ||
*/ | ||
|
||
private List<BibEntry> getEmbeddedBibFileEntries(PDDocument document) throws IOException, ParseException { | ||
List<BibEntry> allParsedEntries = new ArrayList<>(); | ||
PDDocumentNameDictionary nameDictionary = document.getDocumentCatalog().getNames(); | ||
if (nameDictionary != null) { | ||
PDEmbeddedFilesNameTreeNode efTree = nameDictionary.getEmbeddedFiles(); | ||
if (efTree != null) { | ||
Map<String, PDComplexFileSpecification> names = efTree.getNames(); | ||
if (names != null) { | ||
allParsedEntries.addAll(extractAndParseFiles(names)); | ||
} else { | ||
List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); | ||
for (PDNameTreeNode<PDComplexFileSpecification> node : kids) { | ||
names = node.getNames(); | ||
allParsedEntries.addAll(extractAndParseFiles(names)); | ||
} | ||
} | ||
} | ||
} | ||
// extract files from annotations | ||
for (PDPage page : document.getPages()) { | ||
for (PDAnnotation annotation : page.getAnnotations()) { | ||
if (annotation instanceof PDAnnotationFileAttachment) { | ||
PDAnnotationFileAttachment annotationFileAttachment = (PDAnnotationFileAttachment) annotation; | ||
PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) annotationFileAttachment.getFile(); | ||
allParsedEntries.addAll(extractAndParseFile(getEmbeddedFile(fileSpec))); | ||
} | ||
} | ||
} | ||
return allParsedEntries; | ||
} | ||
|
||
private List<BibEntry> extractAndParseFiles(Map<String, PDComplexFileSpecification> names) throws IOException, ParseException { | ||
List<BibEntry> allParsedEntries = new ArrayList<>(); | ||
for (Map.Entry<String, PDComplexFileSpecification> entry : names.entrySet()) { | ||
String filename = entry.getKey(); | ||
FileUtil.getFileExtension(filename); | ||
if (FileUtil.isBibFile(Path.of(filename))) { | ||
PDComplexFileSpecification fileSpec = entry.getValue(); | ||
allParsedEntries.addAll(extractAndParseFile(getEmbeddedFile(fileSpec))); | ||
} | ||
} | ||
return allParsedEntries; | ||
} | ||
|
||
private List<BibEntry> extractAndParseFile(PDEmbeddedFile embeddedFile) throws IOException, ParseException { | ||
return bibtexParser.parseEntries(embeddedFile.createInputStream()); | ||
} | ||
|
||
private static PDEmbeddedFile getEmbeddedFile(PDComplexFileSpecification fileSpec) { | ||
// search for the first available alternative of the embedded file | ||
PDEmbeddedFile embeddedFile = null; | ||
if (fileSpec != null) { | ||
embeddedFile = fileSpec.getEmbeddedFileUnicode(); | ||
if (embeddedFile == null) { | ||
embeddedFile = fileSpec.getEmbeddedFileDos(); | ||
} | ||
if (embeddedFile == null) { | ||
embeddedFile = fileSpec.getEmbeddedFileMac(); | ||
} | ||
if (embeddedFile == null) { | ||
embeddedFile = fileSpec.getEmbeddedFileUnix(); | ||
} | ||
if (embeddedFile == null) { | ||
embeddedFile = fileSpec.getEmbeddedFile(); | ||
} | ||
} | ||
return embeddedFile; | ||
} | ||
|
||
@Override | ||
public String getName() { | ||
return "PDFembeddedbibfile"; | ||
} | ||
|
||
@Override | ||
public StandardFileType getFileType() { | ||
return StandardFileType.PDF; | ||
} | ||
|
||
@Override | ||
public String getDescription() { | ||
return "PdfEmbeddedBibFileImporter imports an embedded Bib-File from the PDF."; | ||
} | ||
|
||
} |
Oops, something went wrong.