diff --git a/CHANGELOG.md b/CHANGELOG.md index 93532bbfa36..806c9ea2636 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -131,6 +131,7 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `# - We fixed an issue where the same menu for changing entry type had two different sizes and weights. [#4977](https://github.com/JabRef/jabref/issues/4977) - We fixed an issue where the "Attach file" dialog, in the right-click menu for an entry, started on the working directory instead of the user's main directory. [#4995](https://github.com/JabRef/jabref/issues/4995) - We fixed an issue where the JabRef Icon in the macOS launchpad was not displayed correctly [#5003](https://github.com/JabRef/jabref/issues/5003) +- We fixed an issue where the "Search for unlinked local files" would throw an exception when parsing the content of a PDF-file with missing "series" information [#5128](https://github.com/JabRef/jabref/issues/5128) ### Removed diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java index da14ac8a3c3..0ff9114e7d3 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -18,6 +18,7 @@ import org.jabref.logic.importer.ParserResult; import org.jabref.logic.importer.fetcher.DoiFetcher; import org.jabref.logic.l10n.Localization; +import org.jabref.logic.util.OS; import org.jabref.logic.util.StandardFileType; import org.jabref.logic.xmp.EncryptedPdfsNotSupportedException; import org.jabref.logic.xmp.XmpUtilReader; @@ -27,6 +28,7 @@ import org.jabref.model.entry.StandardEntryType; import org.jabref.model.entry.field.StandardField; import org.jabref.model.entry.identifier.DOI; +import org.jabref.model.strings.StringUtil; import com.google.common.base.Strings; import org.apache.pdfbox.pdmodel.PDDocument; @@ -37,7 +39,6 @@ *

* Currently, Springer and IEEE formats are supported. *

- * Integrating XMP support is future work */ public class PdfContentImporter extends Importer { @@ -50,11 +51,11 @@ public class PdfContentImporter extends Importer { private String curString; private String year; - public PdfContentImporter(ImportFormatPreferences importFormatPreferences) { this.importFormatPreferences = importFormatPreferences; } + /** * Removes all non-letter characters at the end *

@@ -64,7 +65,7 @@ public PdfContentImporter(ImportFormatPreferences importFormatPreferences) { * TODO: Additionally replace multiple subsequent spaces by one space, which will cause a rename of this method *

*/ - private static String removeNonLettersAtEnd(String input) { + private String removeNonLettersAtEnd(String input) { String result = input.trim(); if (result.isEmpty()) { return result; @@ -82,7 +83,7 @@ private static String removeNonLettersAtEnd(String input) { return result; } - private static String streamlineNames(String names) { + private String streamlineNames(String names) { // TODO: replace with NormalizeNamesFormatter?! String res; // supported formats: @@ -163,7 +164,7 @@ private static String streamlineNames(String names) { res = res.concat(" and "); } if ("et".equalsIgnoreCase(splitNames[i]) && (splitNames.length > (i + 1)) - && "al.".equalsIgnoreCase(splitNames[i + 1])) { + && "al.".equalsIgnoreCase(splitNames[i + 1])) { res = res.concat("others"); break; } else { @@ -178,7 +179,7 @@ private static String streamlineNames(String names) { return res; } - private static String streamlineTitle(String title) { + private String streamlineTitle(String title) { return removeNonLettersAtEnd(title); } @@ -190,17 +191,15 @@ public boolean isRecognizedFormat(BufferedReader input) throws IOException { @Override public ParserResult importDatabase(BufferedReader reader) throws IOException { Objects.requireNonNull(reader); - throw new UnsupportedOperationException( - "PdfContentImporter does not support importDatabase(BufferedReader reader)." - + "Instead use importDatabase(Path filePath, Charset defaultEncoding)."); + throw new UnsupportedOperationException("PdfContentImporter does not support importDatabase(BufferedReader reader)." + + "Instead use importDatabase(Path filePath, Charset defaultEncoding)."); } @Override public ParserResult importDatabase(String data) throws IOException { Objects.requireNonNull(data); - throw new UnsupportedOperationException( - "PdfContentImporter does not support importDatabase(String data)." 
- + "Instead use importDatabase(Path filePath, Charset defaultEncoding)."); + throw new UnsupportedOperationException("PdfContentImporter does not support importDatabase(String data)." + + "Instead use importDatabase(Path filePath, Charset defaultEncoding)."); } @Override @@ -218,280 +217,297 @@ public ParserResult importDatabase(Path filePath, Charset defaultEncoding) { return parserResult; } - // idea: split[] contains the different lines - // blocks are separated by empty lines - // treat each block - // or do special treatment at authors (which are not broken) - // therefore, we do a line-based and not a block-based splitting - // i points to the current line - // curString (mostly) contains the current block - // the different lines are joined into one and thereby separated by " " - lines = firstPageContents.split(System.lineSeparator()); - - lineIndex = 0; //to prevent array index out of bounds exception on second run we need to reset i to zero - - proceedToNextNonEmptyLine(); - if (lineIndex >= lines.length) { - // PDF could not be parsed or is empty - // return empty list - return new ParserResult(); - } + Optional entry = getEntryFromPDFContent(firstPageContents, OS.NEWLINE); + entry.ifPresent(result::add); - // we start at the current line - curString = lines[lineIndex]; - // i might get incremented later and curString modified, too - lineIndex = lineIndex + 1; - - String author; - String editor = null; - String abstractT = null; - String keywords = null; - String title; - String conference = null; - String DOI = null; - String series = null; - String volume = null; - String number = null; - String pages = null; - // year is a class variable as the method extractYear() uses it; - String publisher = null; - - EntryType type = StandardEntryType.InProceedings; - if (curString.length() > 4) { - // special case: possibly conference as first line on the page - extractYear(); - if (curString.contains("Conference")) { + } catch (EncryptedPdfsNotSupportedException 
e) { + return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported.")); + } catch (IOException exception) { + return ParserResult.fromError(exception); + } catch (FetcherException e) { + return ParserResult.fromErrorMessage(e.getMessage()); + } + + result.forEach(entry -> entry.addFile(new LinkedFile("", filePath.toAbsolutePath().toString(), "PDF"))); + return new ParserResult(result); + } + + //make this method package visible so we can test it + Optional getEntryFromPDFContent(String firstpageContents, String lineSeparator) { + + // idea: lines[] contains the different lines + // blocks are separated by empty lines + // treat each block + // or do special treatment at authors (which are not broken) + // therefore, we do a line-based and not a block-based splitting + // lineIndex points to the current line + // curString (mostly) contains the current block + // the different lines are joined into one and thereby separated by " " + + String firstpageContentsUnifiedLineBreaks = StringUtil.unifyLineBreaks(firstpageContents, lineSeparator); + + lines = firstpageContentsUnifiedLineBreaks.split(lineSeparator); + + lineIndex = 0; //to prevent array index out of bounds exception on second run we need to reset lineIndex to zero + + proceedToNextNonEmptyLine(); + if (lineIndex >= lines.length) { + // PDF could not be parsed or is empty + // return an empty Optional + return Optional.empty(); + } + + // we start at the current line + curString = lines[lineIndex]; + // lineIndex might get incremented later and curString modified, too + lineIndex = lineIndex + 1; + + String author; + String editor = null; + String abstractT = null; + String keywords = null; + String title; + String conference = null; + String DOI = null; + String series = null; + String volume = null; + String number = null; + String pages = null; + // year is a class variable as the method extractYear() uses it; + String publisher = null; + + EntryType type = StandardEntryType.InProceedings; + if (curString.length() > 
4) { + // special case: possibly conference as first line on the page + extractYear(); + if (curString.contains("Conference")) { + fillCurStringWithNonEmptyLines(); + conference = curString; + curString = ""; + } else { + // e.g. Copyright (c) 1998 by the Genetics Society of America + // future work: get year using RegEx + String lower = curString.toLowerCase(Locale.ROOT); + if (lower.contains("copyright")) { fillCurStringWithNonEmptyLines(); - conference = curString; + publisher = curString; curString = ""; - } else { - // e.g. Copyright (c) 1998 by the Genetics Society of America - // future work: get year using RegEx - String lower = curString.toLowerCase(Locale.ROOT); - if (lower.contains("copyright")) { - fillCurStringWithNonEmptyLines(); - publisher = curString; - curString = ""; - } } } + } + + // start: title + fillCurStringWithNonEmptyLines(); + title = streamlineTitle(curString); + curString = ""; + //i points to the next non-empty line - // start: title - fillCurStringWithNonEmptyLines(); - title = streamlineTitle(curString); - curString = ""; - //i points to the next non-empty line - - // after title: authors - author = null; - while ((lineIndex < lines.length) && !"".equals(lines[lineIndex])) { - // author names are unlikely to be lines among different lines - // treat them line by line - curString = streamlineNames(lines[lineIndex]); - if (author == null) { - author = curString; + // after title: authors + author = null; + while ((lineIndex < lines.length) && !"".equals(lines[lineIndex])) { + // author names are unlikely to be lines among different lines + // treat them line by line + curString = streamlineNames(lines[lineIndex]); + if (author == null) { + author = curString; + } else { + if ("".equals(curString)) { + // if lines[i] is "and" then "" is returned by streamlineNames -> do nothing } else { - if ("".equals(curString)) { - // if lines[i] is "and" then "" is returned by streamlineNames -> do nothing - } else { - author = author.concat(" and 
").concat(curString); - } + author = author.concat(" and ").concat(curString); } - lineIndex++; } - curString = ""; lineIndex++; + } + curString = ""; + lineIndex++; - // then, abstract and keywords follow - while (lineIndex < lines.length) { - curString = lines[lineIndex]; - if ((curString.length() >= "Abstract".length()) && "Abstract".equalsIgnoreCase(curString.substring(0, "Abstract".length()))) { - if (curString.length() == "Abstract".length()) { - // only word "abstract" found -- skip line - curString = ""; - } else { - curString = curString.substring("Abstract".length() + 1).trim().concat(System.lineSeparator()); - } - lineIndex++; - // fillCurStringWithNonEmptyLines() cannot be used as that uses " " as line separator - // whereas we need linebreak as separator - while ((lineIndex < lines.length) && !"".equals(lines[lineIndex])) { - curString = curString.concat(lines[lineIndex]).concat(System.lineSeparator()); - lineIndex++; - } - abstractT = curString.trim(); - lineIndex++; - } else if ((curString.length() >= "Keywords".length()) && "Keywords".equalsIgnoreCase(curString.substring(0, "Keywords".length()))) { - if (curString.length() == "Keywords".length()) { - // only word "Keywords" found -- skip line - curString = ""; - } else { - curString = curString.substring("Keywords".length() + 1).trim(); - } + // then, abstract and keywords follow + while (lineIndex < lines.length) { + curString = lines[lineIndex]; + if ((curString.length() >= "Abstract".length()) && "Abstract".equalsIgnoreCase(curString.substring(0, "Abstract".length()))) { + if (curString.length() == "Abstract".length()) { + // only word "abstract" found -- skip line + curString = ""; + } else { + curString = curString.substring("Abstract".length() + 1).trim().concat(System.lineSeparator()); + } + lineIndex++; + // fillCurStringWithNonEmptyLines() cannot be used as that uses " " as line separator + // whereas we need linebreak as separator + while ((lineIndex < lines.length) && 
!"".equals(lines[lineIndex])) { + curString = curString.concat(lines[lineIndex]).concat(System.lineSeparator()); lineIndex++; - fillCurStringWithNonEmptyLines(); - keywords = removeNonLettersAtEnd(curString); + } + abstractT = curString.trim(); + lineIndex++; + } else if ((curString.length() >= "Keywords".length()) && "Keywords".equalsIgnoreCase(curString.substring(0, "Keywords".length()))) { + if (curString.length() == "Keywords".length()) { + // only word "Keywords" found -- skip line + curString = ""; } else { - String lower = curString.toLowerCase(Locale.ROOT); + curString = curString.substring("Keywords".length() + 1).trim(); + } + lineIndex++; + fillCurStringWithNonEmptyLines(); + keywords = removeNonLettersAtEnd(curString); + } else { + String lower = curString.toLowerCase(Locale.ROOT); - int pos = lower.indexOf("technical"); + int pos = lower.indexOf("technical"); + if (pos >= 0) { + type = StandardEntryType.TechReport; + pos = curString.trim().lastIndexOf(' '); if (pos >= 0) { - type = StandardEntryType.TechReport; - pos = curString.trim().lastIndexOf(' '); - if (pos >= 0) { - // assumption: last character of curString is NOT ' ' - // otherwise pos+1 leads to an out-of-bounds exception - number = curString.substring(pos + 1); - } + // assumption: last character of curString is NOT ' ' + // otherwise pos+1 leads to an out-of-bounds exception + number = curString.substring(pos + 1); } - - lineIndex++; - proceedToNextNonEmptyLine(); } + + lineIndex++; + proceedToNextNonEmptyLine(); } + } - lineIndex = lines.length - 1; + lineIndex = lines.length - 1; - // last block: DOI, detailed information - // sometimes, this information is in the third last block etc... - // therefore, read until the beginning of the file + // last block: DOI, detailed information + // sometimes, this information is in the third last block etc... 
+ // therefore, read until the beginning of the file - while (lineIndex >= 0) { - readLastBlock(); - // i now points to the block before or is -1 - // curString contains the last block, separated by " " + while (lineIndex >= 0) { + readLastBlock(); + // i now points to the block before or is -1 + // curString contains the last block, separated by " " - extractYear(); + extractYear(); - int pos = curString.indexOf("(Eds.)"); - if ((pos >= 0) && (publisher == null)) { - // looks like a Springer last line - // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009. - publisher = "Springer"; - editor = streamlineNames(curString.substring(0, pos - 1)); - curString = curString.substring(pos + "(Eds.)".length() + 2); //+2 because of ":" after (Eds.) and the subsequent space - String[] springerSplit = curString.split(", "); - if (springerSplit.length >= 4) { - conference = springerSplit[0]; + int pos = curString.indexOf("(Eds.)"); + if ((pos >= 0) && (publisher == null)) { + // looks like a Springer last line + // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009. + publisher = "Springer"; + editor = streamlineNames(curString.substring(0, pos - 1)); - String seriesData = springerSplit[1]; - int lastSpace = seriesData.lastIndexOf(' '); - series = seriesData.substring(0, lastSpace); - volume = seriesData.substring(lastSpace + 1); + int edslength = "(Eds.)".length(); + int posWithEditor = pos + edslength + 2;//+2 because of ":" after (Eds.) 
and the subsequent space + if (posWithEditor > curString.length()) { + curString = curString.substring(posWithEditor - 2); //fewer than two characters follow "(Eds.)", so we subtract the 2 again + } else { + curString = curString.substring(posWithEditor); + } + String[] springerSplit = curString.split(", "); + if (springerSplit.length >= 4) { + conference = springerSplit[0]; - pages = springerSplit[2].substring(4); + String seriesData = springerSplit[1]; + int lastSpace = seriesData.lastIndexOf(' '); + series = seriesData.substring(0, lastSpace); + volume = seriesData.substring(lastSpace + 1); - if (springerSplit[3].length() >= 4) { - year = springerSplit[3].substring(0, 4); - } + pages = springerSplit[2].substring(4); + + if (springerSplit[3].length() >= 4) { + year = springerSplit[3].substring(0, 4); } - } else { - if (DOI == null) { - pos = curString.indexOf("DOI"); - if (pos < 0) { - pos = curString.indexOf(StandardField.DOI.getName()); + } + } else { + if (DOI == null) { + pos = curString.indexOf("DOI"); + if (pos < 0) { + pos = curString.indexOf(StandardField.DOI.getName()); + } + if (pos >= 0) { + pos += 3; + char delimiter = curString.charAt(pos); + if ((delimiter == ':') || (delimiter == ' ')) { + pos++; } - if (pos >= 0) { - pos += 3; - char delimiter = curString.charAt(pos); - if ((delimiter == ':') || (delimiter == ' ')) { - pos++; - } - int nextSpace = curString.indexOf(' ', pos); - if (nextSpace > 0) { - DOI = curString.substring(pos, nextSpace); - } else { - DOI = curString.substring(pos); - } + int nextSpace = curString.indexOf(' ', pos); + if (nextSpace > 0) { + DOI = curString.substring(pos, nextSpace); + } else { + DOI = curString.substring(pos); } } + } - if ((publisher == null) && curString.contains("IEEE")) { - // IEEE has the conference things at the end - publisher = "IEEE"; - - // year is extracted by extractYear - // otherwise, we could it determine as follows: - // String yearStr = curString.substring(curString.length()-4); - // if (isYear(yearStr)) { 
- // year = yearStr; - // } - - if (conference == null) { - pos = curString.indexOf('$'); + if ((publisher == null) && curString.contains("IEEE")) { + // IEEE has the conference things at the end + publisher = "IEEE"; + + // year is extracted by extractYear + // otherwise, we could it determine as follows: + // String yearStr = curString.substring(curString.length()-4); + // if (isYear(yearStr)) { + // year = yearStr; + // } + + if (conference == null) { + pos = curString.indexOf('$'); + if (pos > 0) { + // we found the price + // before the price, the ISSN is stated + // skip that + pos -= 2; + while ((pos >= 0) && (curString.charAt(pos) != ' ')) { + pos--; + } if (pos > 0) { - // we found the price - // before the price, the ISSN is stated - // skip that - pos -= 2; - while ((pos >= 0) && (curString.charAt(pos) != ' ')) { - pos--; - } - if (pos > 0) { - conference = curString.substring(0, pos); - } + conference = curString.substring(0, pos); } } } } } + } - BibEntry entry = new BibEntry(); - entry.setType(type); + BibEntry entry = new BibEntry(); + entry.setType(type); - // TODO: institution parsing missing + // TODO: institution parsing missing - if (author != null) { - entry.setField(StandardField.AUTHOR, author); - } - if (editor != null) { - entry.setField(StandardField.EDITOR, editor); - } - if (abstractT != null) { - entry.setField(StandardField.ABSTRACT, abstractT); - } - if (!Strings.isNullOrEmpty(keywords)) { - entry.setField(StandardField.KEYWORDS, keywords); - } - if (title != null) { - entry.setField(StandardField.TITLE, title); - } - if (conference != null) { - entry.setField(StandardField.BOOKTITLE, conference); - } - if (DOI != null) { - entry.setField(StandardField.DOI, DOI); - } - if (series != null) { - entry.setField(StandardField.SERIES, series); - } - if (volume != null) { - entry.setField(StandardField.VOLUME, volume); - } - if (number != null) { - entry.setField(StandardField.NUMBER, number); - } - if (pages != null) { - 
entry.setField(StandardField.PAGES, pages); - } - if (year != null) { - entry.setField(StandardField.YEAR, year); - } - if (publisher != null) { - entry.setField(StandardField.PUBLISHER, publisher); - } - - result.add(entry); - } catch (EncryptedPdfsNotSupportedException e) { - return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported.")); - } catch (IOException exception) { - return ParserResult.fromError(exception); - } catch (FetcherException e) { - return ParserResult.fromErrorMessage(e.getMessage()); + if (author != null) { + entry.setField(StandardField.AUTHOR, author); } - - result.forEach(entry -> entry.addFile(new LinkedFile("", filePath.toAbsolutePath().toString(), "PDF"))); - return new ParserResult(result); + if (editor != null) { + entry.setField(StandardField.EDITOR, editor); + } + if (abstractT != null) { + entry.setField(StandardField.ABSTRACT, abstractT); + } + if (!Strings.isNullOrEmpty(keywords)) { + entry.setField(StandardField.KEYWORDS, keywords); + } + if (title != null) { + entry.setField(StandardField.TITLE, title); + } + if (conference != null) { + entry.setField(StandardField.BOOKTITLE, conference); + } + if (DOI != null) { + entry.setField(StandardField.DOI, DOI); + } + if (series != null) { + entry.setField(StandardField.SERIES, series); + } + if (volume != null) { + entry.setField(StandardField.VOLUME, volume); + } + if (number != null) { + entry.setField(StandardField.NUMBER, number); + } + if (pages != null) { + entry.setField(StandardField.PAGES, pages); + } + if (year != null) { + entry.setField(StandardField.YEAR, year); + } + if (publisher != null) { + entry.setField(StandardField.PUBLISHER, publisher); + } + return Optional.of(entry); } private String getFirstPageContents(PDDocument document) throws IOException { diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java index 
502d3b3ee6b..e7df979a3a1 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java @@ -5,6 +5,7 @@ import java.nio.file.Paths; import java.util.Collections; import java.util.List; +import java.util.Optional; import org.jabref.logic.importer.ImportFormatPreferences; import org.jabref.logic.util.StandardFileType; @@ -35,8 +36,7 @@ void testsGetExtensions() { @Test void testGetDescription() { - assertEquals( - "PdfContentImporter parses data of the first page of the PDF and creates a BibTeX entry. Currently, Springer and IEEE formats are supported.", + assertEquals("PdfContentImporter parses data of the first page of the PDF and creates a BibTeX entry. Currently, Springer and IEEE formats are supported.", importer.getDescription()); } @@ -62,4 +62,31 @@ void importTwiceWorksAsExpected() throws Exception { assertEquals(Collections.singletonList(expected), resultSecondImport); } + @Test + void testParsingEditorWithoutPagesorSeriesInformation() { + + BibEntry entry = new BibEntry(StandardEntryType.InProceedings); + entry.setField(StandardField.AUTHOR, "Anke Lüdeling and Merja Kytö (Eds.)"); + entry.setField(StandardField.EDITOR, "Anke Lüdeling and Merja Kytö"); + entry.setField(StandardField.PUBLISHER, "Springer"); + entry.setField(StandardField.TITLE, "Corpus Linguistics – An International Handbook – Lüdeling, Anke, Kytö, Merja (Eds.)"); + + String firstPageContents = "Corpus Linguistics – An International Handbook – Lüdeling, Anke,\n" + + "Kytö, Merja (Eds.)\n" + + "\n" + + "Anke Lüdeling, Merja Kytö (Eds.)\n" + + "\n" + + "VOLUME 2\n" + + "\n" + + "This handbook provides an up-to-date survey of the field of corpus linguistics, a Handbücher zur Sprach- und\n" + + "field whose methodology has revolutionized much of the empirical work done in Kommunikationswissenschaft / Handbooks\n" + + "\n" + + "of Linguistics and Communication Science\n" + + "most 
fields of linguistic study over the past decade. (HSK) 29/2\n" + + "\n" + + "vii, 578 pages\n" + + "Corpus linguistics investigates human language by starting out from large\n"; + + assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContents, "\n")); + } }