diff --git a/CHANGELOG.md b/CHANGELOG.md index be815db3ce3..a82a7ec2ae2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve - We added the option to copy the DOI of an entry directly from the context menu copy submenu. [#7826](https://github.com/JabRef/jabref/issues/7826) - We added a fulltext search feature. [#2838](https://github.com/JabRef/jabref/pull/2838) +- We improved the deduction of bib-entries from imported fulltext pdfs. [#7947](https://github.com/JabRef/jabref/pull/7947) - We added unprotect_terms to the list of bracketed pattern modifiers [#7826](https://github.com/JabRef/jabref/pull/7960) - We added an icon picker in group edit dialog. [#6142](https://github.com/JabRef/jabref/issues/6142) diff --git a/src/main/java/org/jabref/gui/entryeditor/EntryEditor.java b/src/main/java/org/jabref/gui/entryeditor/EntryEditor.java index 84c98978b75..aa3097ec004 100644 --- a/src/main/java/org/jabref/gui/entryeditor/EntryEditor.java +++ b/src/main/java/org/jabref/gui/entryeditor/EntryEditor.java @@ -355,7 +355,7 @@ private void setupToolBar() { // Add menu for fetching bibliographic information ContextMenu fetcherMenu = new ContextMenu(); - for (EntryBasedFetcher fetcher : WebFetchers.getEntryBasedFetchers(preferencesService.getImportFormatPreferences())) { + for (EntryBasedFetcher fetcher : WebFetchers.getEntryBasedFetchers(preferencesService.getImportFormatPreferences(), preferencesService.getFilePreferences(), databaseContext, preferencesService.getDefaultEncoding())) { MenuItem fetcherMenuItem = new MenuItem(fetcher.getName()); fetcherMenuItem.setOnAction(event -> fetchAndMerge(fetcher)); fetcherMenu.getItems().add(fetcherMenuItem); diff --git a/src/main/java/org/jabref/logic/externalfiles/ExternalFilesContentImporter.java b/src/main/java/org/jabref/logic/externalfiles/ExternalFilesContentImporter.java index da9a88d7c62..9ff0ca204bf 100644 --- 
a/src/main/java/org/jabref/logic/externalfiles/ExternalFilesContentImporter.java +++ b/src/main/java/org/jabref/logic/externalfiles/ExternalFilesContentImporter.java @@ -7,7 +7,7 @@ import org.jabref.logic.importer.ImportFormatPreferences; import org.jabref.logic.importer.OpenDatabase; import org.jabref.logic.importer.ParserResult; -import org.jabref.logic.importer.fileformat.PdfContentImporter; +import org.jabref.logic.importer.fileformat.PdfMergeMetadataImporter; import org.jabref.logic.importer.fileformat.PdfXmpImporter; import org.jabref.logic.preferences.TimestampPreferences; import org.jabref.model.util.FileUpdateMonitor; @@ -23,7 +23,11 @@ public ExternalFilesContentImporter(ImportFormatPreferences importFormatPreferen } public ParserResult importPDFContent(Path file) { - return new PdfContentImporter(importFormatPreferences).importDatabase(file, StandardCharsets.UTF_8); + try { + return new PdfMergeMetadataImporter(importFormatPreferences).importDatabase(file, StandardCharsets.UTF_8); + } catch (IOException e) { + return ParserResult.fromError(e); + } } public ParserResult importXMPContent(Path file) { diff --git a/src/main/java/org/jabref/logic/importer/ImportFormatReader.java b/src/main/java/org/jabref/logic/importer/ImportFormatReader.java index 3b00f3e5057..08a72a802a1 100644 --- a/src/main/java/org/jabref/logic/importer/ImportFormatReader.java +++ b/src/main/java/org/jabref/logic/importer/ImportFormatReader.java @@ -2,12 +2,14 @@ import java.io.IOException; import java.nio.file.Path; +import java.util.ArrayList; import java.util.List; import java.util.Objects; import java.util.Optional; import java.util.SortedSet; import java.util.TreeSet; +import org.jabref.logic.importer.fetcher.GrobidCitationFetcher; import org.jabref.logic.importer.fileformat.BibTeXMLImporter; import org.jabref.logic.importer.fileformat.BiblioscapeImporter; import org.jabref.logic.importer.fileformat.BibtexImporter; @@ -22,6 +24,10 @@ import 
org.jabref.logic.importer.fileformat.MsBibImporter; import org.jabref.logic.importer.fileformat.OvidImporter; import org.jabref.logic.importer.fileformat.PdfContentImporter; +import org.jabref.logic.importer.fileformat.PdfEmbeddedBibFileImporter; +import org.jabref.logic.importer.fileformat.PdfGrobidImporter; +import org.jabref.logic.importer.fileformat.PdfMergeMetadataImporter; +import org.jabref.logic.importer.fileformat.PdfVerbatimBibTextImporter; import org.jabref.logic.importer.fileformat.PdfXmpImporter; import org.jabref.logic.importer.fileformat.RepecNepImporter; import org.jabref.logic.importer.fileformat.RisImporter; @@ -42,7 +48,7 @@ public class ImportFormatReader { * All import formats. * Sorted accordingly to {@link Importer#compareTo}, which defaults to alphabetically by the name */ - private final SortedSet formats = new TreeSet<>(); + private final List formats = new ArrayList<>(); private ImportFormatPreferences importFormatPreferences; @@ -51,8 +57,6 @@ public void resetImportFormats(ImportFormatPreferences newImportFormatPreference formats.clear(); - formats.add(new BiblioscapeImporter()); - formats.add(new BibtexImporter(importFormatPreferences, fileMonitor)); formats.add(new BibTeXMLImporter()); formats.add(new CopacImporter()); formats.add(new EndnoteImporter(importFormatPreferences)); @@ -64,11 +68,17 @@ public void resetImportFormats(ImportFormatPreferences newImportFormatPreference formats.add(new ModsImporter(importFormatPreferences)); formats.add(new MsBibImporter()); formats.add(new OvidImporter()); + formats.add(new PdfMergeMetadataImporter(importFormatPreferences)); + formats.add(new PdfVerbatimBibTextImporter(importFormatPreferences)); formats.add(new PdfContentImporter(importFormatPreferences)); + formats.add(new PdfEmbeddedBibFileImporter(importFormatPreferences)); + formats.add(new PdfGrobidImporter(GrobidCitationFetcher.GROBID_URL, importFormatPreferences)); formats.add(new PdfXmpImporter(xmpPreferences)); formats.add(new 
RepecNepImporter(importFormatPreferences)); formats.add(new RisImporter()); formats.add(new SilverPlatterImporter()); + formats.add(new BiblioscapeImporter()); + formats.add(new BibtexImporter(importFormatPreferences, fileMonitor)); // Get custom import formats formats.addAll(importFormatPreferences.getCustomImportList()); @@ -110,26 +120,26 @@ public ParserResult importFromFile(String format, Path file) throws ImportExcept * All importers. *

*

- * Elements are in default order. + * Elements are sorted by name. *

* * @return all custom importers, elements are of type InputFormat */ public SortedSet getImportFormats() { - return this.formats; + return new TreeSet<>(this.formats); } /** * Human readable list of all known import formats (name and CLI Id). *

- *

List is in default-order.

+ *

List is sorted by importer name.

* * @return human readable list of all known import formats */ public String getImportFormatList() { StringBuilder sb = new StringBuilder(); - for (Importer imFo : formats) { + for (Importer imFo : getImportFormats()) { int pad = Math.max(0, 14 - imFo.getName().length()); sb.append(" "); sb.append(imFo.getName()); @@ -166,20 +176,25 @@ public UnknownFormatImport(String format, ParserResult parserResult) { public UnknownFormatImport importUnknownFormat(Path filePath, TimestampPreferences timestampPreferences, FileUpdateMonitor fileMonitor) throws ImportException { Objects.requireNonNull(filePath); - // First, see if it is a BibTeX file: try { - ParserResult parserResult = OpenDatabase.loadDatabase(filePath, importFormatPreferences, timestampPreferences, fileMonitor); - if (parserResult.getDatabase().hasEntries() || !parserResult.getDatabase().hasNoStrings()) { - parserResult.setFile(filePath.toFile()); - return new UnknownFormatImport(ImportFormatReader.BIBTEX_FORMAT, parserResult); + UnknownFormatImport unknownFormatImport = importUnknownFormat(importer -> importer.importDatabase(filePath, importFormatPreferences.getEncoding()), importer -> importer.isRecognizedFormat(filePath, importFormatPreferences.getEncoding())); + unknownFormatImport.parserResult.setFile(filePath.toFile()); + return unknownFormatImport; + } catch (ImportException e) { + // If all importers fail, try to read the file as BibTeX + try { + ParserResult parserResult = OpenDatabase.loadDatabase(filePath, importFormatPreferences, timestampPreferences, fileMonitor); + if (parserResult.getDatabase().hasEntries() || !parserResult.getDatabase().hasNoStrings()) { + parserResult.setFile(filePath.toFile()); + return new UnknownFormatImport(ImportFormatReader.BIBTEX_FORMAT, parserResult); + } else { + throw new ImportException(Localization.lang("Could not find a suitable import format.")); + } + } catch (IOException ignore) { + // Ignored + throw new ImportException(Localization.lang("Could not find a 
suitable import format.")); } - } catch (IOException ignore) { - // Ignored } - - UnknownFormatImport unknownFormatImport = importUnknownFormat(importer -> importer.importDatabase(filePath, importFormatPreferences.getEncoding()), importer -> importer.isRecognizedFormat(filePath, importFormatPreferences.getEncoding())); - unknownFormatImport.parserResult.setFile(filePath.toFile()); - return unknownFormatImport; } /** @@ -198,7 +213,7 @@ private UnknownFormatImport importUnknownFormat(CheckedFunction getIdBasedFetchers(ImportFormatPreferenc /** * @return sorted set containing entry based fetchers */ - public static SortedSet getEntryBasedFetchers(ImportFormatPreferences importFormatPreferences) { + public static SortedSet getEntryBasedFetchers(ImportFormatPreferences importFormatPreferences, FilePreferences filePreferences, BibDatabaseContext databaseContext, Charset defaultEncoding) { SortedSet set = new TreeSet<>(Comparator.comparing(WebFetcher::getName)); set.add(new AstrophysicsDataSystem(importFormatPreferences)); set.add(new DoiFetcher(importFormatPreferences)); @@ -141,6 +145,7 @@ public static SortedSet getEntryBasedFetchers(ImportFormatPre set.add(new MathSciNet(importFormatPreferences)); set.add(new CrossRef()); set.add(new ZbMATH(importFormatPreferences)); + set.add(new PdfMergeMetadataImporter.EntryBasedFetcherWrapper(importFormatPreferences, filePreferences, databaseContext, defaultEncoding)); return set; } diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcher.java b/src/main/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcher.java index bf16d71570d..7955e206fd7 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcher.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcher.java @@ -23,9 +23,10 @@ public class GrobidCitationFetcher implements SearchBasedFetcher { + public static final String GROBID_URL = "http://grobid.jabref.org:8070"; + private static final Logger 
LOGGER = LoggerFactory.getLogger(GrobidCitationFetcher.class); - private static final String GROBID_URL = "http://grobid.jabref.org:8070"; private ImportFormatPreferences importFormatPreferences; private GrobidService grobidService; diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfEmbeddedBibFileImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfEmbeddedBibFileImporter.java new file mode 100644 index 00000000000..e91754afeb4 --- /dev/null +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfEmbeddedBibFileImporter.java @@ -0,0 +1,166 @@ +package org.jabref.logic.importer.fileformat; + +import java.io.BufferedReader; +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +import org.jabref.logic.importer.ImportFormatPreferences; +import org.jabref.logic.importer.Importer; +import org.jabref.logic.importer.ParseException; +import org.jabref.logic.importer.ParserResult; +import org.jabref.logic.l10n.Localization; +import org.jabref.logic.util.StandardFileType; +import org.jabref.logic.util.io.FileUtil; +import org.jabref.logic.xmp.EncryptedPdfsNotSupportedException; +import org.jabref.logic.xmp.XmpUtilReader; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.util.DummyFileUpdateMonitor; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary; +import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDNameTreeNode; +import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification; +import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment; + +/** 
+ * PdfEmbeddedBibFileImporter imports an embedded Bib-File from the PDF. + */ +public class PdfEmbeddedBibFileImporter extends Importer { + + private final ImportFormatPreferences importFormatPreferences; + private final BibtexParser bibtexParser; + + public PdfEmbeddedBibFileImporter(ImportFormatPreferences importFormatPreferences) { + this.importFormatPreferences = importFormatPreferences; + bibtexParser = new BibtexParser(importFormatPreferences, new DummyFileUpdateMonitor()); + } + + @Override + public boolean isRecognizedFormat(BufferedReader input) throws IOException { + return input.readLine().startsWith("%PDF"); + } + + @Override + public ParserResult importDatabase(BufferedReader reader) throws IOException { + Objects.requireNonNull(reader); + throw new UnsupportedOperationException("PdfEmbeddedBibFileImporter does not support importDatabase(BufferedReader reader)." + + "Instead use importDatabase(Path filePath, Charset defaultEncoding)."); + } + + @Override + public ParserResult importDatabase(String data) throws IOException { + Objects.requireNonNull(data); + throw new UnsupportedOperationException("PdfEmbeddedBibFileImporter does not support importDatabase(String data)." 
+ + "Instead use importDatabase(Path filePath, Charset defaultEncoding)."); + } + + @Override + public ParserResult importDatabase(Path filePath, Charset defaultEncoding) { + try (PDDocument document = XmpUtilReader.loadWithAutomaticDecryption(filePath)) { + return new ParserResult(getEmbeddedBibFileEntries(document)); + } catch (EncryptedPdfsNotSupportedException e) { + return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported.")); + } catch (IOException | ParseException e) { + return ParserResult.fromError(e); + } + } + + /** + * Extraction of embedded files in PDFs, adapted from: + * https://svn.apache.org/repos/asf/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java + */ + + private List getEmbeddedBibFileEntries(PDDocument document) throws IOException, ParseException { + List allParsedEntries = new ArrayList<>(); + PDDocumentNameDictionary nameDictionary = document.getDocumentCatalog().getNames(); + if (nameDictionary != null) { + PDEmbeddedFilesNameTreeNode efTree = nameDictionary.getEmbeddedFiles(); + if (efTree != null) { + Map names = efTree.getNames(); + if (names != null) { + allParsedEntries.addAll(extractAndParseFiles(names)); + } else { + List> kids = efTree.getKids(); + for (PDNameTreeNode node : kids) { + names = node.getNames(); + allParsedEntries.addAll(extractAndParseFiles(names)); + } + } + } + } + // extract files from annotations + for (PDPage page : document.getPages()) { + for (PDAnnotation annotation : page.getAnnotations()) { + if (annotation instanceof PDAnnotationFileAttachment) { + PDAnnotationFileAttachment annotationFileAttachment = (PDAnnotationFileAttachment) annotation; + PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) annotationFileAttachment.getFile(); + allParsedEntries.addAll(extractAndParseFile(getEmbeddedFile(fileSpec))); + } + } + } + return allParsedEntries; + } + + private List extractAndParseFiles(Map names) throws 
IOException, ParseException { + List allParsedEntries = new ArrayList<>(); + for (Map.Entry entry : names.entrySet()) { + String filename = entry.getKey(); + FileUtil.getFileExtension(filename); + if (FileUtil.isBibFile(Path.of(filename))) { + PDComplexFileSpecification fileSpec = entry.getValue(); + allParsedEntries.addAll(extractAndParseFile(getEmbeddedFile(fileSpec))); + } + } + return allParsedEntries; + } + + private List extractAndParseFile(PDEmbeddedFile embeddedFile) throws IOException, ParseException { + return bibtexParser.parseEntries(embeddedFile.createInputStream()); + } + + private static PDEmbeddedFile getEmbeddedFile(PDComplexFileSpecification fileSpec) { + // search for the first available alternative of the embedded file + PDEmbeddedFile embeddedFile = null; + if (fileSpec != null) { + embeddedFile = fileSpec.getEmbeddedFileUnicode(); + if (embeddedFile == null) { + embeddedFile = fileSpec.getEmbeddedFileDos(); + } + if (embeddedFile == null) { + embeddedFile = fileSpec.getEmbeddedFileMac(); + } + if (embeddedFile == null) { + embeddedFile = fileSpec.getEmbeddedFileUnix(); + } + if (embeddedFile == null) { + embeddedFile = fileSpec.getEmbeddedFile(); + } + } + return embeddedFile; + } + + @Override + public String getName() { + return "PDFembeddedbibfile"; + } + + @Override + public StandardFileType getFileType() { + return StandardFileType.PDF; + } + + @Override + public String getDescription() { + return "PdfEmbeddedBibFileImporter imports an embedded Bib-File from the PDF."; + } + +} diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfGrobidImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfGrobidImporter.java new file mode 100644 index 00000000000..84f952d0601 --- /dev/null +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfGrobidImporter.java @@ -0,0 +1,100 @@ +package org.jabref.logic.importer.fileformat; + +import java.io.BufferedReader; +import java.io.IOException; +import 
java.nio.charset.Charset; +import java.nio.file.Path; +import java.util.List; +import java.util.Objects; +import java.util.Optional; + +import org.jabref.logic.importer.ImportFormatPreferences; +import org.jabref.logic.importer.Importer; +import org.jabref.logic.importer.ParserResult; +import org.jabref.logic.importer.util.GrobidService; +import org.jabref.logic.util.StandardFileType; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.LinkedFile; +import org.jabref.model.util.FileHelper; + +/** + * Wraps the GrobidService function to be used as an Importer. + */ +public class PdfGrobidImporter extends Importer { + + private final GrobidService grobidService; + private final ImportFormatPreferences importFormatPreferences; + + public PdfGrobidImporter(String grobidServerURL, ImportFormatPreferences importFormatPreferences) { + this.grobidService = new GrobidService(grobidServerURL); + this.importFormatPreferences = importFormatPreferences; + } + + @Override + public String getName() { + return "Grobid"; + } + + @Override + public StandardFileType getFileType() { + return StandardFileType.PDF; + } + + @Override + public ParserResult importDatabase(BufferedReader reader) throws IOException { + Objects.requireNonNull(reader); + throw new UnsupportedOperationException( + "PdfGrobidImporter does not support importDatabase(BufferedReader reader)." + + "Instead use importDatabase(Path filePath, Charset defaultEncoding)."); + } + + @Override + public ParserResult importDatabase(String data) throws IOException { + Objects.requireNonNull(data); + throw new UnsupportedOperationException( + "PdfGrobidImporter does not support importDatabase(String data)." 
+ + "Instead use importDatabase(Path filePath, Charset defaultEncoding)."); + } + + @Override + public ParserResult importDatabase(Path filePath, Charset defaultEncoding) { + Objects.requireNonNull(filePath); + try { + List result = grobidService.processPDF(filePath, importFormatPreferences); + result.forEach(entry -> entry.addFile(new LinkedFile("", filePath.toAbsolutePath(), "PDF"))); + return new ParserResult(result); + } catch (Exception exception) { + return ParserResult.fromError(exception); + } + } + + @Override + public boolean isRecognizedFormat(BufferedReader reader) throws IOException { + Objects.requireNonNull(reader); + return false; + } + + /** + * Returns whether the given file has one of the file extensions of the PDF file type. + * The content of the file is not inspected. + */ + @Override + public boolean isRecognizedFormat(Path filePath, Charset defaultEncoding) throws IOException { + Objects.requireNonNull(filePath); + Optional extension = FileHelper.getFileExtension(filePath); + if (extension.isEmpty()) { + return false; + } + return getFileType().getExtensions().contains(extension.get()); + } + + @Override + public String getId() { + return "grobidPdf"; + } + + @Override + public String getDescription() { + return "Wraps the GrobidService function to be used as an Importer."; + } +} diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java new file mode 100644 index 00000000000..2d301e70307 --- /dev/null +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java @@ -0,0 +1,178 @@ +package org.jabref.logic.importer.fileformat; + +import java.io.BufferedReader; +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; + +import 
org.jabref.gui.DefaultInjector; +import org.jabref.logic.importer.EntryBasedFetcher; +import org.jabref.logic.importer.FetcherException; +import org.jabref.logic.importer.ImportFormatPreferences; +import org.jabref.logic.importer.Importer; +import org.jabref.logic.importer.ParserResult; +import org.jabref.logic.importer.fetcher.DoiFetcher; +import org.jabref.logic.importer.fetcher.GrobidCitationFetcher; +import org.jabref.logic.importer.fetcher.IsbnFetcher; +import org.jabref.logic.importer.util.FileFieldParser; +import org.jabref.logic.util.StandardFileType; +import org.jabref.model.database.BibDatabaseContext; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.LinkedFile; +import org.jabref.model.entry.field.Field; +import org.jabref.model.entry.field.StandardField; +import org.jabref.preferences.FilePreferences; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * PdfMergeMetadataImporter imports metadata from a PDF using multiple strategies and merges the results. + */ +public class PdfMergeMetadataImporter extends Importer { + + private static final Logger LOGGER = LoggerFactory.getLogger(DefaultInjector.class); + + private final List metadataImporters; + private final ImportFormatPreferences importFormatPreferences; + + public PdfMergeMetadataImporter(ImportFormatPreferences importFormatPreferences) { + this.importFormatPreferences = importFormatPreferences; + this.metadataImporters = List.of( + new PdfGrobidImporter(GrobidCitationFetcher.GROBID_URL, importFormatPreferences), + new PdfEmbeddedBibFileImporter(importFormatPreferences), + new PdfXmpImporter(importFormatPreferences.getXmpPreferences()), + new PdfVerbatimBibTextImporter(importFormatPreferences), + new PdfContentImporter(importFormatPreferences) + ); + } + + @Override + public boolean isRecognizedFormat(BufferedReader input) throws IOException { + return input.readLine().startsWith("%PDF"); + } + + @Override + public ParserResult importDatabase(BufferedReader reader) throws IOException { + 
Objects.requireNonNull(reader); + throw new UnsupportedOperationException("PdfMergeMetadataImporter does not support importDatabase(BufferedReader reader)." + + "Instead use importDatabase(Path filePath, Charset defaultEncoding)."); + } + + @Override + public ParserResult importDatabase(String data) throws IOException { + Objects.requireNonNull(data); + throw new UnsupportedOperationException("PdfMergeMetadataImporter does not support importDatabase(String data)." + + "Instead use importDatabase(Path filePath, Charset defaultEncoding)."); + } + + @Override + public ParserResult importDatabase(Path filePath, Charset defaultEncoding) throws IOException { + List candidates = new ArrayList<>(); + + for (Importer metadataImporter : metadataImporters) { + List extractedEntries = metadataImporter.importDatabase(filePath, defaultEncoding).getDatabase().getEntries(); + if (extractedEntries.size() == 0) { + continue; + } + candidates.add(extractedEntries.get(0)); + } + if (candidates.isEmpty()) { + return new ParserResult(); + } + List fetchedCandidates = new ArrayList<>(); + for (BibEntry candidate : candidates) { + if (candidate.hasField(StandardField.DOI)) { + try { + new DoiFetcher(importFormatPreferences).performSearchById(candidate.getField(StandardField.DOI).get()).ifPresent((fromDoi) -> fetchedCandidates.add(fromDoi)); + } catch (FetcherException e) { + LOGGER.error("Fetching failed for DOI \"{}\".", candidate.getField(StandardField.DOI).get(), e); + } + } + if (candidate.hasField(StandardField.ISBN)) { + try { + new IsbnFetcher(importFormatPreferences).performSearchById(candidate.getField(StandardField.ISBN).get()).ifPresent((fromISBN) -> fetchedCandidates.add(fromISBN)); + } catch (FetcherException e) { + LOGGER.error("Fetching failed for ISBN \"{}\".", candidate.getField(StandardField.ISBN).get(), e); + } + } + } + candidates.addAll(0, fetchedCandidates); + BibEntry entry = new BibEntry(); + for (BibEntry candidate : candidates) { + if 
(BibEntry.DEFAULT_TYPE.equals(entry.getType())) { + entry.setType(candidate.getType()); + } + Set presentFields = entry.getFields(); + for (Map.Entry fieldEntry : candidate.getFieldMap().entrySet()) { + // Don't merge FILE fields that point to a stored file as we set that to filePath anyway. + // Nevertheless, retain online links. + if (StandardField.FILE.equals(fieldEntry.getKey()) && + !FileFieldParser.parse(fieldEntry.getValue()).stream().anyMatch((linkedFile) -> linkedFile.isOnlineLink())) { + continue; + } + // Only overwrite non-present fields + if (!presentFields.contains(fieldEntry.getKey())) { + entry.setField(fieldEntry.getKey(), fieldEntry.getValue()); + } + } + } + + entry.addFile(new LinkedFile("", filePath, StandardFileType.PDF.getName())); + return new ParserResult(List.of(entry)); + } + + @Override + public String getName() { + return "PDFmergemetadata"; + } + + @Override + public StandardFileType getFileType() { + return StandardFileType.PDF; + } + + @Override + public String getDescription() { + return "PdfMergeMetadataImporter imports metadata from a PDF using multiple strategies and merging the result."; + } + + public static class EntryBasedFetcherWrapper extends PdfMergeMetadataImporter implements EntryBasedFetcher { + + private static final Logger LOGGER = LoggerFactory.getLogger(DefaultInjector.class); + private final FilePreferences filePreferences; + private final BibDatabaseContext databaseContext; + private final Charset defaultEncoding; + + public EntryBasedFetcherWrapper(ImportFormatPreferences importFormatPreferences, FilePreferences filePreferences, BibDatabaseContext context, Charset defaultEncoding) { + super(importFormatPreferences); + this.filePreferences = filePreferences; + this.databaseContext = context; + this.defaultEncoding = defaultEncoding; + } + + @Override + public List performSearch(BibEntry entry) throws FetcherException { + for (LinkedFile file : entry.getFiles()) { + Optional filePath = file.findIn(databaseContext, 
filePreferences); + if (filePath.isPresent()) { + try { + ParserResult result = importDatabase(filePath.get(), defaultEncoding); + if (!result.isEmpty()) { + return result.getDatabase().getEntries(); + } + } catch (IOException e) { + LOGGER.error("Cannot read \"{}\"", filePath.get(), e); + } + } + } + return List.of(); + } + } +} diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfVerbatimBibTextImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfVerbatimBibTextImporter.java new file mode 100644 index 00000000000..60405b76f33 --- /dev/null +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfVerbatimBibTextImporter.java @@ -0,0 +1,103 @@ +package org.jabref.logic.importer.fileformat; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.StringWriter; +import java.nio.charset.Charset; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +import org.jabref.logic.importer.ImportFormatPreferences; +import org.jabref.logic.importer.Importer; +import org.jabref.logic.importer.ParseException; +import org.jabref.logic.importer.ParserResult; +import org.jabref.logic.l10n.Localization; +import org.jabref.logic.util.StandardFileType; +import org.jabref.logic.xmp.EncryptedPdfsNotSupportedException; +import org.jabref.logic.xmp.XmpUtilReader; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.LinkedFile; +import org.jabref.model.util.DummyFileUpdateMonitor; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.text.PDFTextStripper; + +/** + * This importer imports a verbatim BibTeX entry from the first page of the PDF. 
+ */ +public class PdfVerbatimBibTextImporter extends Importer { + + private final ImportFormatPreferences importFormatPreferences; + + public PdfVerbatimBibTextImporter(ImportFormatPreferences importFormatPreferences) { + this.importFormatPreferences = importFormatPreferences; + } + + @Override + public boolean isRecognizedFormat(BufferedReader input) throws IOException { + return input.readLine().startsWith("%PDF"); + } + + @Override + public ParserResult importDatabase(BufferedReader reader) throws IOException { + Objects.requireNonNull(reader); + throw new UnsupportedOperationException("PdfVerbatimBibTextImporter does not support importDatabase(BufferedReader reader)." + + "Instead use importDatabase(Path filePath, Charset defaultEncoding)."); + } + + @Override + public ParserResult importDatabase(String data) throws IOException { + Objects.requireNonNull(data); + throw new UnsupportedOperationException("PdfVerbatimBibTextImporter does not support importDatabase(String data)." + + "Instead use importDatabase(Path filePath, Charset defaultEncoding)."); + } + + @Override + public ParserResult importDatabase(Path filePath, Charset defaultEncoding) { + List result = new ArrayList<>(1); + try (PDDocument document = XmpUtilReader.loadWithAutomaticDecryption(filePath)) { + String firstPageContents = getFirstPageContents(document); + BibtexParser parser = new BibtexParser(importFormatPreferences, new DummyFileUpdateMonitor()); + result = parser.parseEntries(firstPageContents); + } catch (EncryptedPdfsNotSupportedException e) { + return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported.")); + } catch (IOException | ParseException e) { + return ParserResult.fromError(e); + } + + result.forEach(entry -> entry.addFile(new LinkedFile("", filePath.toAbsolutePath(), "PDF"))); + result.forEach(entry -> entry.setCommentsBeforeEntry("")); + return new ParserResult(result); + } + + private String getFirstPageContents(PDDocument document) throws IOException 
{ + PDFTextStripper stripper = new PDFTextStripper(); + + stripper.setStartPage(1); + stripper.setEndPage(1); + stripper.setSortByPosition(true); + stripper.setParagraphEnd(System.lineSeparator()); + StringWriter writer = new StringWriter(); + stripper.writeText(document, writer); + + return writer.toString(); + } + + @Override + public String getName() { + return "PdfVerbatimBibText"; + } + + @Override + public StandardFileType getFileType() { + return StandardFileType.PDF; + } + + @Override + public String getDescription() { + return "PdfVerbatimBibTextImporter imports a verbatim BibTeX entry from the first page of the PDF."; + } + +} diff --git a/src/main/java/org/jabref/logic/importer/util/GrobidService.java b/src/main/java/org/jabref/logic/importer/util/GrobidService.java index ec9b7567fc0..3259a44869a 100644 --- a/src/main/java/org/jabref/logic/importer/util/GrobidService.java +++ b/src/main/java/org/jabref/logic/importer/util/GrobidService.java @@ -1,11 +1,18 @@ package org.jabref.logic.importer.util; import java.io.IOException; -import java.net.URLEncoder; -import java.nio.charset.StandardCharsets; -import java.time.Duration; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; -import org.jabref.logic.net.URLDownload; +import org.jabref.logic.importer.ImportFormatPreferences; +import org.jabref.logic.importer.ParseException; +import org.jabref.logic.importer.fileformat.BibtexParser; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.util.DummyFileUpdateMonitor; + +import org.jsoup.Connection; +import org.jsoup.Jsoup; /** * Implements an API to a GROBID server, as described at @@ -45,18 +52,41 @@ public GrobidService(String grobidServerURL) { * @throws IOException if an I/O excecption during the call ocurred or no BibTeX entry could be determiend */ public String processCitation(String rawCitation, ConsolidateCitations consolidateCitations) throws IOException { - rawCitation = URLEncoder.encode(rawCitation, 
StandardCharsets.UTF_8); - URLDownload urlDownload = new URLDownload(grobidServerURL - + "/api/processCitation"); - urlDownload.setConnectTimeout(Duration.ofSeconds(5)); - urlDownload.addHeader("Accept", MediaTypes.APPLICATION_BIBTEX); - urlDownload.setPostData("citations=" + rawCitation + "&consolidateCitations=" + consolidateCitations); - String httpResponse = urlDownload.asString(); + Connection.Response response = Jsoup.connect(grobidServerURL + "/api/processCitation") + .header("Accept", MediaTypes.APPLICATION_BIBTEX) + .data("citations", rawCitation) + .data("consolidateCitations", String.valueOf(consolidateCitations.getCode())) + .method(Connection.Method.POST) + .ignoreContentType(true) + .timeout(20000) + .execute(); + String httpResponse = response.body(); - if (httpResponse == null || httpResponse.equals("@misc{-1,\n author = {}\n}\n")) { // This filters empty BibTeX entries + if (httpResponse == null || httpResponse.equals("@misc{-1,\n author = {}\n}\n") || httpResponse.equals("@misc{-1,\n author = {" + rawCitation + "}\n}\n")) { // This filters empty BibTeX entries throw new IOException("The GROBID server response does not contain anything."); } return httpResponse; } + + public List processPDF(Path filePath, ImportFormatPreferences importFormatPreferences) throws IOException, ParseException { + Connection.Response response = Jsoup.connect(grobidServerURL + "/api/processHeaderDocument") + .header("Accept", MediaTypes.APPLICATION_BIBTEX) + .data("input", filePath.toString(), Files.newInputStream(filePath)) + .method(Connection.Method.POST) + .ignoreContentType(true) + .timeout(20000) + .execute(); + + String httpResponse = response.body(); + + if (httpResponse == null || httpResponse.equals("@misc{-1,\n author = {}\n}\n")) { // This filters empty BibTeX entries + throw new IOException("The GROBID server response does not contain anything."); + } + + BibtexParser parser = new BibtexParser(importFormatPreferences, new DummyFileUpdateMonitor()); + List 
result = parser.parseEntries(httpResponse); + result.stream().forEach((entry) -> entry.setCitationKey("")); + return result; + } } diff --git a/src/test/java/org/jabref/logic/importer/ImportFormatReaderIntegrationTest.java b/src/test/java/org/jabref/logic/importer/ImportFormatReaderIntegrationTest.java index 3f338e8d6e8..8a32f29391d 100644 --- a/src/test/java/org/jabref/logic/importer/ImportFormatReaderIntegrationTest.java +++ b/src/test/java/org/jabref/logic/importer/ImportFormatReaderIntegrationTest.java @@ -5,6 +5,7 @@ import java.nio.file.Path; import java.util.ArrayList; import java.util.Collection; +import java.util.Set; import java.util.stream.Stream; import org.jabref.logic.preferences.TimestampPreferences; @@ -31,6 +32,7 @@ void setUp() { reader = new ImportFormatReader(); ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS); when(importFormatPreferences.getEncoding()).thenReturn(StandardCharsets.UTF_8); + when(importFormatPreferences.getCustomImportList()).thenReturn(Set.of()); when(timestampPreferences.getTimestampField()).then(invocation -> StandardField.TIMESTAMP); reader.resetImportFormats(importFormatPreferences, mock(XmpPreferences.class), new DummyFileUpdateMonitor()); } diff --git a/src/test/java/org/jabref/logic/importer/ImportFormatReaderTestParameterless.java b/src/test/java/org/jabref/logic/importer/ImportFormatReaderTestParameterless.java index ab838803ae2..b44125c11bb 100644 --- a/src/test/java/org/jabref/logic/importer/ImportFormatReaderTestParameterless.java +++ b/src/test/java/org/jabref/logic/importer/ImportFormatReaderTestParameterless.java @@ -2,6 +2,7 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Path; +import java.util.Set; import org.jabref.logic.preferences.TimestampPreferences; import org.jabref.logic.xmp.XmpPreferences; @@ -27,6 +28,7 @@ void setUp() { reader = new ImportFormatReader(); ImportFormatPreferences importFormatPreferences = 
mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS); when(importFormatPreferences.getEncoding()).thenReturn(StandardCharsets.UTF_8); + when(importFormatPreferences.getCustomImportList()).thenReturn(Set.of()); reader.resetImportFormats(importFormatPreferences, mock(XmpPreferences.class), fileMonitor); } diff --git a/src/test/java/org/jabref/logic/importer/WebFetchersTest.java b/src/test/java/org/jabref/logic/importer/WebFetchersTest.java index 458a5b0985d..0efb6c29d24 100644 --- a/src/test/java/org/jabref/logic/importer/WebFetchersTest.java +++ b/src/test/java/org/jabref/logic/importer/WebFetchersTest.java @@ -1,5 +1,6 @@ package org.jabref.logic.importer; +import java.nio.charset.Charset; import java.util.Collection; import java.util.HashSet; import java.util.Set; @@ -13,6 +14,8 @@ import org.jabref.logic.importer.fetcher.IsbnViaOttoBibFetcher; import org.jabref.logic.importer.fetcher.JstorFetcher; import org.jabref.logic.importer.fetcher.MrDLibFetcher; +import org.jabref.model.database.BibDatabaseContext; +import org.jabref.preferences.FilePreferences; import io.github.classgraph.ClassGraph; import io.github.classgraph.ClassInfoList; @@ -62,7 +65,7 @@ void getIdBasedFetchersReturnsAllFetcherDerivingFromIdBasedFetcher() throws Exce @Test void getEntryBasedFetchersReturnsAllFetcherDerivingFromEntryBasedFetcher() throws Exception { - Set idFetchers = WebFetchers.getEntryBasedFetchers(importFormatPreferences); + Set idFetchers = WebFetchers.getEntryBasedFetchers(importFormatPreferences, mock(FilePreferences.class), mock(BibDatabaseContext.class), Charset.defaultCharset()); try (ScanResult scanResult = classGraph.scan()) { ClassInfoList controlClasses = scanResult.getClassesImplementing(EntryBasedFetcher.class.getCanonicalName()); diff --git a/src/test/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcherTest.java b/src/test/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcherTest.java index f970273c34b..a8b4f5267d4 100644 --- 
a/src/test/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcherTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcherTest.java @@ -35,18 +35,23 @@ public class GrobidCitationFetcherTest { static String example1 = "Derwing, T. M., Rossiter, M. J., & Munro, M. J. (2002). Teaching native speakers to listen to foreign-accented speech. Journal of Multilingual and Multicultural Development, 23(4), 245-259."; static BibEntry example1AsBibEntry = new BibEntry(StandardEntryType.Article).withCitationKey("-1") - .withField(StandardField.AUTHOR, "Derwing, T and Rossiter, M and Munro, M") - .withField(StandardField.TITLE, "Teaching native speakers to listen to foreign-accented speech") + .withField(StandardField.AUTHOR, "Derwing, Tracey and Rossiter, Marian and Munro, Murray") + .withField(StandardField.TITLE, "Teaching Native Speakers to Listen to Foreign-accented Speech") .withField(StandardField.JOURNAL, "Journal of Multilingual and Multicultural Development") + .withField(StandardField.DOI, "10.1080/01434630208666468") + .withField(StandardField.DATE, "2002-09") .withField(StandardField.YEAR, "2002") - .withField(StandardField.PAGES, "245--259") + .withField(StandardField.MONTH, "9") + .withField(StandardField.PAGES, "245-259") .withField(StandardField.VOLUME, "23") + .withField(StandardField.PUBLISHER, "Informa UK Limited") .withField(StandardField.NUMBER, "4"); static String example2 = "Thomas, H. K. (2004). Training strategies for improving listeners' comprehension of foreign-accented speech (Doctoral dissertation). 
University of Colorado, Boulder."; static BibEntry example2AsBibEntry = new BibEntry(BibEntry.DEFAULT_TYPE).withCitationKey("-1") .withField(StandardField.AUTHOR, "Thomas, H") .withField(StandardField.TITLE, "Training strategies for improving listeners' comprehension of foreign-accented speech (Doctoral dissertation)") + .withField(StandardField.DATE, "2004") .withField(StandardField.YEAR, "2004") .withField(StandardField.ADDRESS, "Boulder"); @@ -55,6 +60,7 @@ public class GrobidCitationFetcherTest { .withField(StandardField.AUTHOR, "Turk, J and Graham, P and Verhulst, F") .withField(StandardField.TITLE, "Child and adolescent psychiatry : A developmental approach") .withField(StandardField.PUBLISHER, "Oxford University Press") + .withField(StandardField.DATE, "2007") .withField(StandardField.YEAR, "2007") .withField(StandardField.ADDRESS, "Oxford, England"); @@ -63,6 +69,7 @@ public class GrobidCitationFetcherTest { .withField(StandardField.AUTHOR, "Carr, I and Kidner, R") .withField(StandardField.BOOKTITLE, "Statutes and conventions on international trade law") .withField(StandardField.PUBLISHER, "Cavendish") + .withField(StandardField.DATE, "2003") .withField(StandardField.YEAR, "2003") .withField(StandardField.ADDRESS, "London, England"); diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfEmbeddedBibFileImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfEmbeddedBibFileImporterTest.java new file mode 100644 index 00000000000..dc5dc085672 --- /dev/null +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfEmbeddedBibFileImporterTest.java @@ -0,0 +1,66 @@ +package org.jabref.logic.importer.fileformat; + +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; + +import org.jabref.logic.importer.ImportFormatPreferences; +import org.jabref.logic.util.StandardFileType; +import org.jabref.model.entry.BibEntry; +import 
org.jabref.model.entry.field.StandardField; +import org.jabref.model.entry.types.StandardEntryType; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Answers; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +class PdfEmbeddedBibFileImporterTest { + + private PdfEmbeddedBibFileImporter importer; + private ImportFormatPreferences importFormatPreferences; + + @BeforeEach + void setUp() { + importFormatPreferences = mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS); + when(importFormatPreferences.getFieldContentFormatterPreferences().getNonWrappableFields()).thenReturn(List.of()); + importer = new PdfEmbeddedBibFileImporter(importFormatPreferences); + } + + @Test + void testsGetExtensions() { + assertEquals(StandardFileType.PDF, importer.getFileType()); + } + + @Test + void testGetDescription() { + assertEquals("PdfEmbeddedBibFileImporter imports an embedded Bib-File from the PDF.", + importer.getDescription()); + } + + @Test + void doesNotHandleEncryptedPdfs() throws Exception { + Path file = Path.of(PdfEmbeddedBibFileImporter.class.getResource("/pdfs/encrypted.pdf").toURI()); + List result = importer.importDatabase(file, StandardCharsets.UTF_8).getDatabase().getEntries(); + assertEquals(Collections.emptyList(), result); + } + + @Test + void importWorksAsExpected() throws Exception { + Path file = Path.of(PdfEmbeddedBibFileImporterTest.class.getResource("mixedMetadata.pdf").toURI()); + List result = importer.importDatabase(file, StandardCharsets.UTF_8).getDatabase().getEntries(); + + BibEntry expected = new BibEntry(StandardEntryType.Misc); + expected.setCitationKey("jabreftext2021"); + expected.setField(StandardField.AUTHOR, "Someone embedded"); + expected.setField(StandardField.TITLE, "I like beds"); + expected.setField(StandardField.DOI, "10.1002/9781118257517"); + 
expected.setField(StandardField.COMMENT, "From embedded bib"); + + assertEquals(Collections.singletonList(expected), result); + } +} diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfGrobidImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfGrobidImporterTest.java new file mode 100644 index 00000000000..00de1468886 --- /dev/null +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfGrobidImporterTest.java @@ -0,0 +1,71 @@ +package org.jabref.logic.importer.fileformat; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.List; +import java.util.Optional; + +import org.jabref.logic.importer.ImportFormatPreferences; +import org.jabref.logic.importer.fetcher.GrobidCitationFetcher; +import org.jabref.logic.util.StandardFileType; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.StandardField; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Answers; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class PdfGrobidImporterTest { + + private PdfGrobidImporter importer; + private ImportFormatPreferences importFormatPreferences; + + @BeforeEach + public void setUp() { + importFormatPreferences = mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS); + when(importFormatPreferences.getKeywordSeparator()).thenReturn(','); + importer = new PdfGrobidImporter(GrobidCitationFetcher.GROBID_URL, importFormatPreferences); + } + + @Test + public void testsGetExtensions() { + assertEquals(StandardFileType.PDF, importer.getFileType()); + } + + @Test + public void testImportEntries() throws URISyntaxException { + Path file = 
Path.of(PdfGrobidImporterTest.class.getResource("LNCS-minimal.pdf").toURI()); + List bibEntries = importer.importDatabase(file, StandardCharsets.UTF_8).getDatabase().getEntries(); + + assertEquals(1, bibEntries.size()); + + BibEntry be0 = bibEntries.get(0); + assertEquals(Optional.of("Lastname, Firstname"), be0.getField(StandardField.AUTHOR)); + assertEquals(Optional.of("Paper Title"), be0.getField(StandardField.TITLE)); + } + + @Test + public void testIsRecognizedFormat() throws IOException, URISyntaxException { + Path file = Path.of(PdfGrobidImporterTest.class.getResource("annotated.pdf").toURI()); + assertTrue(importer.isRecognizedFormat(file, StandardCharsets.UTF_8)); + } + + @Test + public void testIsRecognizedFormatReject() throws IOException, URISyntaxException { + Path file = Path.of(PdfGrobidImporterTest.class.getResource("BibtexImporter.examples.bib").toURI()); + assertFalse(importer.isRecognizedFormat(file, StandardCharsets.UTF_8)); + } + + @Test + public void testGetCommandLineId() { + assertEquals("grobidPdf", importer.getId()); + } +} diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporterTest.java new file mode 100644 index 00000000000..2a9bfbadb4e --- /dev/null +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporterTest.java @@ -0,0 +1,89 @@ +package org.jabref.logic.importer.fileformat; + +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; + +import org.jabref.logic.importer.ImportFormatPreferences; +import org.jabref.logic.util.StandardFileType; +import org.jabref.logic.xmp.XmpPreferences; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.StandardField; +import org.jabref.model.entry.field.UnknownField; +import org.jabref.model.entry.types.StandardEntryType; + +import org.junit.jupiter.api.BeforeEach; 
+import org.junit.jupiter.api.Test; +import org.mockito.Answers; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +class PdfMergeMetadataImporterTest { + + private PdfMergeMetadataImporter importer; + private ImportFormatPreferences importFormatPreferences; + private XmpPreferences xmpPreferences; + + @BeforeEach + void setUp() { + importFormatPreferences = mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS); + when(importFormatPreferences.getFieldContentFormatterPreferences().getNonWrappableFields()).thenReturn(List.of()); + importer = new PdfMergeMetadataImporter(importFormatPreferences); + } + + @Test + void testsGetExtensions() { + assertEquals(StandardFileType.PDF, importer.getFileType()); + } + + @Test + void testGetDescription() { + assertEquals("PdfMergeMetadataImporter imports metadata from a PDF using multiple strategies and merging the result.", + importer.getDescription()); + } + + @Test + void doesNotHandleEncryptedPdfs() throws Exception { + Path file = Path.of(PdfMergeMetadataImporter.class.getResource("/pdfs/encrypted.pdf").toURI()); + List result = importer.importDatabase(file, StandardCharsets.UTF_8).getDatabase().getEntries(); + assertEquals(Collections.emptyList(), result); + } + + @Test + void importWorksAsExpected() throws Exception { + Path file = Path.of(PdfMergeMetadataImporterTest.class.getResource("mixedMetadata.pdf").toURI()); + List result = importer.importDatabase(file, StandardCharsets.UTF_8).getDatabase().getEntries(); + + // From DOI (contained in embedded bib file) + BibEntry expected = new BibEntry(StandardEntryType.Book); + expected.setCitationKey("Burd_2011"); + expected.setField(StandardField.AUTHOR, "Barry Burd"); + expected.setField(StandardField.TITLE, "Java{\\textregistered} For Dummies{\\textregistered}"); + expected.setField(StandardField.PUBLISHER, "Wiley Publishing, Inc."); + 
expected.setField(StandardField.YEAR, "2011"); + expected.setField(StandardField.AUTHOR, "Barry Burd"); + expected.setField(StandardField.MONTH, "jul"); + expected.setField(StandardField.DOI, "10.1002/9781118257517"); + + // From ISBN (contained on first page verbatim bib entry) + expected.setField(StandardField.DATE, "2018-01-01"); + expected.setField(new UnknownField("ean"), "9780134685991"); + expected.setField(StandardField.ISBN, "0134685997"); + expected.setField(StandardField.URL, "https://www.ebook.de/de/product/28983211/joshua_bloch_effective_java.html"); + + // From embedded bib file + expected.setField(StandardField.COMMENT, "From embedded bib"); + + // From first page verbatim bib entry + expected.setField(StandardField.JOURNAL, "Some Journal"); + expected.setField(StandardField.VOLUME, "1"); + + // From merge + expected.setField(StandardField.FILE, ":" + file.toAbsolutePath().toString() + ":" + StandardFileType.PDF.getName()); + + assertEquals(Collections.singletonList(expected), result); + } +} diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfVerbatimBibTextImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfVerbatimBibTextImporterTest.java new file mode 100644 index 00000000000..334f0cb76c0 --- /dev/null +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfVerbatimBibTextImporterTest.java @@ -0,0 +1,72 @@ +package org.jabref.logic.importer.fileformat; + +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; + +import org.jabref.logic.importer.ImportFormatPreferences; +import org.jabref.logic.util.StandardFileType; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.LinkedFile; +import org.jabref.model.entry.field.StandardField; +import org.jabref.model.entry.types.StandardEntryType; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Answers; + +import static 
org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +class PdfVerbatimBibTextImporterTest { + + private PdfVerbatimBibTextImporter importer; + private ImportFormatPreferences importFormatPreferences; + + @BeforeEach + void setUp() { + importFormatPreferences = mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS); + when(importFormatPreferences.getFieldContentFormatterPreferences().getNonWrappableFields()).thenReturn(List.of()); + importer = new PdfVerbatimBibTextImporter(importFormatPreferences); + } + + @Test + void testsGetExtensions() { + assertEquals(StandardFileType.PDF, importer.getFileType()); + } + + @Test + void testGetDescription() { + assertEquals("PdfVerbatimBibTextImporter imports a verbatim BibTeX entry from the first page of the PDF.", + importer.getDescription()); + } + + @Test + void doesNotHandleEncryptedPdfs() throws Exception { + Path file = Path.of(PdfVerbatimBibTextImporter.class.getResource("/pdfs/encrypted.pdf").toURI()); + List result = importer.importDatabase(file, StandardCharsets.UTF_8).getDatabase().getEntries(); + assertEquals(Collections.emptyList(), result); + } + + @Test + void importTwiceWorksAsExpected() throws Exception { + Path file = Path.of(PdfVerbatimBibTextImporterTest.class.getResource("mixedMetadata.pdf").toURI()); + List result = importer.importDatabase(file, StandardCharsets.UTF_8).getDatabase().getEntries(); + + BibEntry expected = new BibEntry(StandardEntryType.Article); + expected.setCitationKey("jabreftest2021"); + expected.setField(StandardField.AUTHOR, "Me, myself and I"); + expected.setField(StandardField.TITLE, "Something"); + expected.setField(StandardField.VOLUME, "1"); + expected.setField(StandardField.JOURNAL, "Some Journal"); + expected.setField(StandardField.YEAR, "2021"); + expected.setField(StandardField.ISBN, "0134685997"); + expected.setFiles(Collections.singletonList(new LinkedFile("", file.toAbsolutePath(), 
"PDF"))); + + List resultSecondImport = importer.importDatabase(file, StandardCharsets.UTF_8).getDatabase().getEntries(); + assertEquals(Collections.singletonList(expected), result); + assertEquals(Collections.singletonList(expected), resultSecondImport); + } +} diff --git a/src/test/java/org/jabref/logic/importer/util/GrobidServiceTest.java b/src/test/java/org/jabref/logic/importer/util/GrobidServiceTest.java index d73a8911ce0..b4a0f9e8f0d 100644 --- a/src/test/java/org/jabref/logic/importer/util/GrobidServiceTest.java +++ b/src/test/java/org/jabref/logic/importer/util/GrobidServiceTest.java @@ -1,25 +1,41 @@ package org.jabref.logic.importer.util; import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Path; +import java.util.List; +import java.util.Optional; +import org.jabref.logic.importer.ImportFormatPreferences; +import org.jabref.logic.importer.ParseException; +import org.jabref.logic.importer.fetcher.GrobidCitationFetcher; +import org.jabref.logic.importer.fileformat.PdfGrobidImporterTest; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.StandardField; import org.jabref.testutils.category.FetcherTest; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.mockito.Answers; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; @FetcherTest public class GrobidServiceTest { private static GrobidService grobidService; + private static ImportFormatPreferences importFormatPreferences; @BeforeAll public static void setup() { - grobidService = new GrobidService("http://grobid.jabref.org:8070"); + grobidService = new GrobidService(GrobidCitationFetcher.GROBID_URL); + importFormatPreferences = 
mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS); + when(importFormatPreferences.getKeywordSeparator()).thenReturn(','); } @Test @@ -30,13 +46,17 @@ public void processValidCitationTest() throws IOException { String[] responseRows = response.split("\n"); assertNotNull(response); assertEquals('@', response.charAt(0)); - assertTrue(responseRows[1].contains("author") && responseRows[1].contains("Derwing, T and Rossiter, M")); - assertTrue(responseRows[2].contains("title") && responseRows[2].contains("Teaching native speakers")); + assertTrue(responseRows[1].contains("author") && responseRows[1].contains("Derwing, Tracey and Rossiter, Marian and Munro, Murray")); + assertTrue(responseRows[2].contains("title") && responseRows[2].contains("Teaching Native Speakers to Listen to Foreign-accented Speech")); assertTrue(responseRows[3].contains("journal") && responseRows[3].contains("Journal of Multilingual and Multicultural")); - assertTrue(responseRows[4].contains("year") && responseRows[4].contains("2002")); - assertTrue(responseRows[5].contains("pages") && responseRows[5].contains("245--259")); - assertTrue(responseRows[6].contains("volume") && responseRows[6].contains("23")); - assertTrue(responseRows[7].contains("number") && responseRows[7].contains("4")); + assertTrue(responseRows[4].contains("publisher") && responseRows[4].contains("Informa UK Limited")); + assertTrue(responseRows[5].contains("date") && responseRows[5].contains("2002-09")); + assertTrue(responseRows[6].contains("year") && responseRows[6].contains("2002")); + assertTrue(responseRows[7].contains("month") && responseRows[7].contains("9")); + assertTrue(responseRows[8].contains("pages") && responseRows[8].contains("245-259")); + assertTrue(responseRows[9].contains("volume") && responseRows[9].contains("23")); + assertTrue(responseRows[10].contains("number") && responseRows[10].contains("4")); + assertTrue(responseRows[11].contains("doi") && 
responseRows[11].contains("10.1080/01434630208666468")); } @Test @@ -51,4 +71,15 @@ public void processInvalidCitationTest() { assertThrows(IOException.class, () -> grobidService.processCitation("iiiiiiiiiiiiiiiiiiiiiiii", GrobidService.ConsolidateCitations.WITH_METADATA)); } + @Test + public void processPdfTest() throws IOException, ParseException, URISyntaxException { + Path file = Path.of(PdfGrobidImporterTest.class.getResource("LNCS-minimal.pdf").toURI()); + List response = grobidService.processPDF(file, importFormatPreferences); + assertEquals(1, response.size()); + BibEntry be0 = response.get(0); + assertEquals(Optional.of("Lastname, Firstname"), be0.getField(StandardField.AUTHOR)); + assertEquals(Optional.of("Paper Title"), be0.getField(StandardField.TITLE)); + assertEquals(Optional.of("2014-10-05"), be0.getField(StandardField.DATE)); + } + } diff --git a/src/test/resources/org/jabref/logic/importer/fileformat/mixedMetadata.pdf b/src/test/resources/org/jabref/logic/importer/fileformat/mixedMetadata.pdf new file mode 100644 index 00000000000..535de7264d5 Binary files /dev/null and b/src/test/resources/org/jabref/logic/importer/fileformat/mixedMetadata.pdf differ