diff --git a/build.gradle b/build.gradle index a04a43b0cb1..91599bdd57e 100644 --- a/build.gradle +++ b/build.gradle @@ -60,6 +60,9 @@ repositories { maven { url 'https://oss.sonatype.org/content/groups/public' } + maven { + url 'https://maven.icm.edu.pl/artifactory/repo' + } } configurations { @@ -81,6 +84,8 @@ dependencies { compile 'org.apache.pdfbox:fontbox:1.8.13' compile 'org.apache.pdfbox:jempbox:1.8.13' + compile 'pl.edu.icm.cermine:cermine-impl:1.11' + // required for reading write-protected PDFs - see https://github.com/JabRef/jabref/pull/942#issuecomment-209252635 compile 'org.bouncycastle:bcprov-jdk15on:1.55' diff --git a/src/main/java/net/sf/jabref/logic/bibtexkeypattern/BibtexKeyPatternPreferences.java b/src/main/java/net/sf/jabref/logic/bibtexkeypattern/BibtexKeyPatternPreferences.java index c634b08b6a7..867efe29a0d 100644 --- a/src/main/java/net/sf/jabref/logic/bibtexkeypattern/BibtexKeyPatternPreferences.java +++ b/src/main/java/net/sf/jabref/logic/bibtexkeypattern/BibtexKeyPatternPreferences.java @@ -1,5 +1,7 @@ package net.sf.jabref.logic.bibtexkeypattern; +import java.util.Collections; + import net.sf.jabref.model.bibtexkeypattern.GlobalBibtexKeyPattern; public class BibtexKeyPatternPreferences { @@ -12,6 +14,16 @@ public class BibtexKeyPatternPreferences { private final GlobalBibtexKeyPattern keyPattern; private Character keywordDelimiter; + public BibtexKeyPatternPreferences() { + this.keyPatternRegex = ""; + this.keyPatternReplacement = ""; + this.alwaysAddLetter = false; + this.firstLetterA = false; + this.enforceLegalKey = false; + this.keyPattern = new GlobalBibtexKeyPattern(Collections.emptyList()); + this.keywordDelimiter = ','; + } + public BibtexKeyPatternPreferences(String keyPatternRegex, String keyPatternReplacement, boolean alwaysAddLetter, boolean firstLetterA, boolean enforceLegalKey, GlobalBibtexKeyPattern keyPattern, Character keywordDelimiter) { diff --git 
a/src/main/java/net/sf/jabref/logic/formatter/bibtexfields/NormalizeDateFormatter.java b/src/main/java/net/sf/jabref/logic/formatter/bibtexfields/NormalizeDateFormatter.java index b0d5d7d97b0..e640054dd8f 100644 --- a/src/main/java/net/sf/jabref/logic/formatter/bibtexfields/NormalizeDateFormatter.java +++ b/src/main/java/net/sf/jabref/logic/formatter/bibtexfields/NormalizeDateFormatter.java @@ -1,14 +1,10 @@ package net.sf.jabref.logic.formatter.bibtexfields; -import java.time.format.DateTimeFormatter; -import java.time.format.DateTimeParseException; -import java.time.temporal.TemporalAccessor; -import java.util.Arrays; -import java.util.List; import java.util.Optional; import net.sf.jabref.logic.l10n.Localization; import net.sf.jabref.model.cleanup.Formatter; +import net.sf.jabref.model.entry.Date; /** * This class transforms date to the format yyyy-mm-dd or yyyy-mm.. @@ -34,13 +30,8 @@ public String getKey() { */ @Override public String format(String value) { - Optional parsedDate = tryParseDate(value); - if (!parsedDate.isPresent()) { - return value; - } - - DateTimeFormatter dateFormatter = DateTimeFormatter.ofPattern("uuuu-MM[-dd]"); - return dateFormatter.format(parsedDate.get()); + Optional<Date> parsedDate = Date.parse(value); + return parsedDate.map(Date::getNormalized).orElse(value); } @Override @@ -53,29 +44,6 @@ public String getExampleInput() { return "29.11.2003"; } - /* - * Try to parse the following formats - * "M/y" (covers 9/15, 9/2015, and 09/2015) - * "MMMM (dd), yyyy" (covers September 1, 2015 and September, 2015) - * "yyyy-MM-dd" (covers 2009-1-15) - * "d.M.uuuu" (covers 15.1.2015) - * "uuuu.M.d" (covers 2015.1.15) - * The code is essentially taken from http://stackoverflow.com/questions/4024544/how-to-parse-dates-in-multiple-formats-using-simpledateformat. 
- */ - private Optional tryParseDate(String dateString) { - List formatStrings = Arrays.asList("uuuu-M-d", "uuuu-M", "M/uu", "M/uuuu", "MMMM d, uuuu", "MMMM, uuuu", - "d.M.uuuu", "uuuu.M.d"); - for (String formatString : formatStrings) { - try { - return Optional.of(DateTimeFormatter.ofPattern(formatString).parse(dateString)); - } catch (DateTimeParseException ignored) { - // Ignored - } - } - - return Optional.empty(); - } - @Override public int hashCode() { return defaultHashCode(); diff --git a/src/main/java/net/sf/jabref/logic/importer/ImportFormatPreferences.java b/src/main/java/net/sf/jabref/logic/importer/ImportFormatPreferences.java index 2fd36c1da51..89606dee9a7 100644 --- a/src/main/java/net/sf/jabref/logic/importer/ImportFormatPreferences.java +++ b/src/main/java/net/sf/jabref/logic/importer/ImportFormatPreferences.java @@ -1,6 +1,8 @@ package net.sf.jabref.logic.importer; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Collections; import java.util.Set; import net.sf.jabref.logic.bibtex.FieldContentParserPreferences; @@ -16,6 +18,15 @@ public class ImportFormatPreferences { private final FieldContentParserPreferences fieldContentParserPreferences; private final boolean keywordSyncEnabled; + public ImportFormatPreferences() { + this.customImportList = Collections.emptySet(); + this.encoding = StandardCharsets.UTF_8; + this.keywordSeparator = ','; + this.bibtexKeyPatternPreferences = new BibtexKeyPatternPreferences(); + this.fieldContentParserPreferences = new FieldContentParserPreferences(); + this.keywordSyncEnabled = false; + } + public ImportFormatPreferences(Set customImportList, Charset encoding, Character keywordSeparator, BibtexKeyPatternPreferences bibtexKeyPatternPreferences, FieldContentParserPreferences fieldContentParserPreferences, boolean keywordSyncEnabled) { diff --git a/src/main/java/net/sf/jabref/logic/importer/ParserResult.java b/src/main/java/net/sf/jabref/logic/importer/ParserResult.java 
index d8edd7fb780..dc10b349f7f 100644 --- a/src/main/java/net/sf/jabref/logic/importer/ParserResult.java +++ b/src/main/java/net/sf/jabref/logic/importer/ParserResult.java @@ -21,14 +21,12 @@ public class ParserResult { private static final ParserResult NULL_RESULT = new ParserResult(null, null, null); private final BibDatabase base; - private MetaData metaData; private final Map entryTypes; - private BibDatabaseContext bibDatabaseContext; - - private File file; private final List warnings = new ArrayList<>(); private final List duplicateKeys = new ArrayList<>(); - + private MetaData metaData; + private BibDatabaseContext bibDatabaseContext; + private File file; private String errorMessage; private boolean invalid; @@ -61,6 +59,18 @@ public static ParserResult fromErrorMessage(String message) { return parserResult; } + public static ParserResult getNullResult() { + return NULL_RESULT; + } + + public static ParserResult fromError(Exception exception) { + return fromErrorMessage(exception.getLocalizedMessage() == null ? exception.toString() : exception.getLocalizedMessage()); + } + + public static ParserResult fromEntry(BibEntry entry) { + return new ParserResult(Collections.singleton(entry)); + } + /** * Check if this base is marked to be added to the currently open tab. Default is false. 
* @@ -181,8 +191,4 @@ public boolean hasDatabaseContext() { public boolean isNullResult() { return this == NULL_RESULT; } - - public static ParserResult getNullResult() { - return NULL_RESULT; - } } diff --git a/src/main/java/net/sf/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/net/sf/jabref/logic/importer/fileformat/PdfContentImporter.java index 75e0f05ec8f..d82d837866e 100644 --- a/src/main/java/net/sf/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/net/sf/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -2,587 +2,126 @@ import java.io.BufferedReader; import java.io.FileInputStream; +import java.io.FileNotFoundException; import java.io.IOException; -import java.io.StringWriter; +import java.io.InputStream; import java.nio.charset.Charset; import java.nio.file.Path; -import java.util.ArrayList; +import java.util.List; import java.util.Objects; -import java.util.Optional; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.stream.Collectors; -import net.sf.jabref.logic.importer.FetcherException; import net.sf.jabref.logic.importer.ImportFormatPreferences; import net.sf.jabref.logic.importer.Importer; import net.sf.jabref.logic.importer.ParserResult; -import net.sf.jabref.logic.importer.fetcher.DoiFetcher; -import net.sf.jabref.logic.l10n.Localization; -import net.sf.jabref.logic.util.DOI; import net.sf.jabref.logic.util.FileExtensions; -import net.sf.jabref.logic.xmp.EncryptedPdfsNotSupportedException; -import net.sf.jabref.logic.xmp.XMPUtil; import net.sf.jabref.model.entry.BibEntry; -import net.sf.jabref.model.entry.BibtexEntryTypes; -import net.sf.jabref.model.entry.EntryType; +import net.sf.jabref.model.entry.BibLatexEntryTypes; +import net.sf.jabref.model.entry.Date; import net.sf.jabref.model.entry.FieldName; -import com.google.common.base.Strings; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.util.PDFTextStripper; +import 
org.apache.commons.io.input.ReaderInputStream; +import pl.edu.icm.cermine.ContentExtractor; +import pl.edu.icm.cermine.exception.AnalysisException; +import pl.edu.icm.cermine.metadata.model.DocumentAuthor; +import pl.edu.icm.cermine.metadata.model.DocumentDate; +import pl.edu.icm.cermine.metadata.model.DocumentMetadata; -/** - * PdfContentImporter parses data of the first page of the PDF and creates a BibTeX entry. - *

- * Currently, Springer and IEEE formats are supported. - *

- * Integrating XMP support is future work - */ public class PdfContentImporter extends Importer { - private static final Pattern YEAR_EXTRACT_PATTERN = Pattern.compile("\\d{4}"); - - // input lines into several lines - private String[] lines; - - // current index in lines - private int i; - - private String curString; - - private String year; - private final ImportFormatPreferences importFormatPreferences; - public PdfContentImporter(ImportFormatPreferences importFormatPreferences) { - this.importFormatPreferences = importFormatPreferences; - } - /** - * Removes all non-letter characters at the end - *

- * EXCEPTION: a closing bracket is NOT removed - *

- *

- * TODO: Additionally replace multiple subsequent spaces by one space, which will cause a rename of this method - *

- */ - private static String removeNonLettersAtEnd(String input) { - String result = input.trim(); - if (result.isEmpty()) { - return result; - } - char lastC = result.charAt(result.length() - 1); - while (!Character.isLetter(lastC) && (lastC != ')')) { - // if there is an asterix, a dot or something else at the end: remove it - result = result.substring(0, result.length() - 1); - if (result.isEmpty()) { - break; - } else { - lastC = result.charAt(result.length() - 1); - } - } - return result; - } - - private static String streamlineNames(String names) { - // TODO: replace with NormalizeNamesFormatter?! - String res; - // supported formats: - // Matthias Schrepfer1, Johannes Wolf1, Jan Mendling1, and Hajo A. Reijers2 - if (names.contains(",")) { - String[] splitNames = names.split(","); - res = ""; - boolean isFirst = true; - for (String splitName : splitNames) { - String curName = removeNonLettersAtEnd(splitName); - if (curName.indexOf("and") == 0) { - // skip possible ands between names - curName = curName.substring(3).trim(); - } else { - int posAnd = curName.indexOf(" and "); - if (posAnd >= 0) { - String nameBefore = curName.substring(0, posAnd); - // cannot be first name as "," is contained in the string - res = res.concat(" and ").concat(removeNonLettersAtEnd(nameBefore)); - curName = curName.substring(posAnd + 5); - } - } - - if (!"".equals(curName)) { - if ("et al.".equalsIgnoreCase(curName)) { - curName = "others"; - } - if (isFirst) { - isFirst = false; - } else { - res = res.concat(" and "); - } - res = res.concat(curName); - } - } - } else { - // assumption: names separated by space - - String[] splitNames = names.split(" "); - if (splitNames.length == 0) { - // empty names... something was really wrong... 
- return ""; - } - - boolean workedOnFirstOrMiddle = false; - boolean isFirst = true; - int i = 0; - res = ""; - do { - if (workedOnFirstOrMiddle) { - // last item was a first or a middle name - // we have to check whether we are on a middle name - // if not, just add the item as last name and add an "and" - if (splitNames[i].contains(".")) { - // we found a middle name - res = res.concat(splitNames[i]).concat(" "); - } else { - // last name found - res = res.concat(removeNonLettersAtEnd(splitNames[i])); - - if (!splitNames[i].isEmpty() && Character.isLowerCase(splitNames[i].charAt(0))) { - // it is probably be "van", "vom", ... - // we just rely on the fact that these things are written in lower case letters - // do NOT finish name - res = res.concat(" "); - } else { - // finish this name - workedOnFirstOrMiddle = false; - } - } - } else { - if ("and".equalsIgnoreCase(splitNames[i])) { - // do nothing, just increment i at the end of this iteration - } else { - if (isFirst) { - isFirst = false; - } else { - res = res.concat(" and "); - } - if ("et".equalsIgnoreCase(splitNames[i]) && (splitNames.length > (i + 1)) - && "al.".equalsIgnoreCase(splitNames[i + 1])) { - res = res.concat("others"); - break; - } else { - res = res.concat(splitNames[i]).concat(" "); - workedOnFirstOrMiddle = true; - } - } - } - i++; - } while (i < splitNames.length); - } - return res; - } - - private static String streamlineTitle(String title) { - return removeNonLettersAtEnd(title); + this.importFormatPreferences = Objects.requireNonNull(importFormatPreferences); } @Override - public boolean isRecognizedFormat(BufferedReader reader) throws IOException { - Objects.requireNonNull(reader); + public boolean isRecognizedFormat(BufferedReader input) throws IOException { + Objects.requireNonNull(input); return false; } @Override - public ParserResult importDatabase(BufferedReader reader) throws IOException { - Objects.requireNonNull(reader); - throw new UnsupportedOperationException( - 
"PdfContentImporter does not support importDatabase(BufferedReader reader)." - + "Instead use importDatabase(Path filePath, Charset defaultEncoding)."); + public ParserResult importDatabase(BufferedReader input) throws IOException { + try { + ContentExtractor extractor = new ContentExtractor(); + extractor.setPDF(new ReaderInputStream(input, importFormatPreferences.getEncoding())); + DocumentMetadata documentMetadata = extractor.getMetadata(); + return convertMetadataToLibrary(documentMetadata); + } catch (AnalysisException e) { + return ParserResult.fromError(e); + } } @Override - public ParserResult importDatabase(Path filePath, Charset defaultEncoding) { - final ArrayList result = new ArrayList<>(1); - try (FileInputStream fileStream = new FileInputStream(filePath.toFile()); - PDDocument document = XMPUtil.loadWithAutomaticDecryption(fileStream)) { - String firstPageContents = getFirstPageContents(document); - - Optional doi = DOI.findInText(firstPageContents); - if (doi.isPresent()) { - ParserResult parserResult = new ParserResult(result); - Optional entry = new DoiFetcher(importFormatPreferences).performSearchById(doi.get().getDOI()); - entry.ifPresent(parserResult.getDatabase()::insertEntry); - return parserResult; - } - - // idea: split[] contains the different lines - // blocks are separated by empty lines - // treat each block - // or do special treatment at authors (which are not broken) - // therefore, we do a line-based and not a block-based splitting - // i points to the current line - // curString (mostly) contains the current block - // the different lines are joined into one and thereby separated by " " - lines = firstPageContents.split(System.lineSeparator()); - - proceedToNextNonEmptyLine(); - if (i >= lines.length) { - // PDF could not be parsed or is empty - // return empty list - return new ParserResult(); - } - - // we start at the current line - curString = lines[i]; - // i might get incremented later and curString modified, too - i = i + 1; 
- - String author; - String editor = null; - String abstractT = null; - String keywords = null; - String title; - String conference = null; - String DOI = null; - String series = null; - String volume = null; - String number = null; - String pages = null; - // year is a class variable as the method extractYear() uses it; - String publisher = null; - - EntryType type = BibtexEntryTypes.INPROCEEDINGS; - if (curString.length() > 4) { - // special case: possibly conference as first line on the page - extractYear(); - if (curString.contains("Conference")) { - fillCurStringWithNonEmptyLines(); - conference = curString; - curString = ""; - } else { - // e.g. Copyright (c) 1998 by the Genetics Society of America - // future work: get year using RegEx - String lower = curString.toLowerCase(); - if (lower.contains("copyright")) { - fillCurStringWithNonEmptyLines(); - publisher = curString; - curString = ""; - } - } - } - - // start: title - fillCurStringWithNonEmptyLines(); - title = streamlineTitle(curString); - curString = ""; - //i points to the next non-empty line - - // after title: authors - author = null; - while ((i < lines.length) && !"".equals(lines[i])) { - // author names are unlikely to be lines among different lines - // treat them line by line - curString = streamlineNames(lines[i]); - if (author == null) { - author = curString; - } else { - if ("".equals(curString)) { - // if lines[i] is "and" then "" is returned by streamlineNames -> do nothing - } else { - author = author.concat(" and ").concat(curString); - } - } - i++; - } - curString = ""; - i++; - - // then, abstract and keywords follow - while (i < lines.length) { - curString = lines[i]; - if ((curString.length() >= "Abstract".length()) && "Abstract".equalsIgnoreCase(curString.substring(0, "Abstract".length()))) { - if (curString.length() == "Abstract".length()) { - // only word "abstract" found -- skip line - curString = ""; - } else { - curString = curString.substring("Abstract".length() + 
1).trim().concat(System.lineSeparator()); - } - i++; - // fillCurStringWithNonEmptyLines() cannot be used as that uses " " as line separator - // whereas we need linebreak as separator - while ((i < lines.length) && !"".equals(lines[i])) { - curString = curString.concat(lines[i]).concat(System.lineSeparator()); - i++; - } - abstractT = curString.trim(); - i++; - } else if ((curString.length() >= "Keywords".length()) && "Keywords".equalsIgnoreCase(curString.substring(0, "Keywords".length()))) { - if (curString.length() == "Keywords".length()) { - // only word "Keywords" found -- skip line - curString = ""; - } else { - curString = curString.substring("Keywords".length() + 1).trim(); - } - i++; - fillCurStringWithNonEmptyLines(); - keywords = removeNonLettersAtEnd(curString); - } else { - String lower = curString.toLowerCase(); - - int pos = lower.indexOf("technical"); - if (pos >= 0) { - type = BibtexEntryTypes.TECHREPORT; - pos = curString.trim().lastIndexOf(' '); - if (pos >= 0) { - // assumption: last character of curString is NOT ' ' - // otherwise pos+1 leads to an out-of-bounds exception - number = curString.substring(pos + 1); - } - } - - i++; - proceedToNextNonEmptyLine(); - } - } - - i = lines.length - 1; - - // last block: DOI, detailed information - // sometimes, this information is in the third last block etc... - // therefore, read until the beginning of the file - - while (i >= 0) { - readLastBlock(); - // i now points to the block before or is -1 - // curString contains the last block, separated by " " - - extractYear(); - - int pos = curString.indexOf("(Eds.)"); - if ((pos >= 0) && (publisher == null)) { - // looks like a Springer last line - // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009. - publisher = "Springer"; - editor = streamlineNames(curString.substring(0, pos - 1)); - curString = curString.substring(pos + "(Eds.)".length() + 2); //+2 because of ":" after (Eds.) 
and the subsequent space - String[] springerSplit = curString.split(", "); - if (springerSplit.length >= 4) { - conference = springerSplit[0]; - - String seriesData = springerSplit[1]; - int lastSpace = seriesData.lastIndexOf(' '); - series = seriesData.substring(0, lastSpace); - volume = seriesData.substring(lastSpace + 1); - - pages = springerSplit[2].substring(4); - - if (springerSplit[3].length() >= 4) { - year = springerSplit[3].substring(0, 4); - } - } - } else { - if (DOI == null) { - pos = curString.indexOf("DOI"); - if (pos < 0) { - pos = curString.indexOf(FieldName.DOI); - } - if (pos >= 0) { - pos += 3; - char delimiter = curString.charAt(pos); - if ((delimiter == ':') || (delimiter == ' ')) { - pos++; - } - int nextSpace = curString.indexOf(' ', pos); - if (nextSpace > 0) { - DOI = curString.substring(pos, nextSpace); - } else { - DOI = curString.substring(pos); - } - } - } - - if ((publisher == null) && curString.contains("IEEE")) { - // IEEE has the conference things at the end - publisher = "IEEE"; - - // year is extracted by extractYear - // otherwise, we could it determine as follows: - // String yearStr = curString.substring(curString.length()-4); - // if (isYear(yearStr)) { - // year = yearStr; - // } - - if (conference == null) { - pos = curString.indexOf('$'); - if (pos > 0) { - // we found the price - // before the price, the ISSN is stated - // skip that - pos -= 2; - while ((pos >= 0) && (curString.charAt(pos) != ' ')) { - pos--; - } - if (pos > 0) { - conference = curString.substring(0, pos); - } - } - } - } - } - } - - BibEntry entry = new BibEntry(); - entry.setType(type); - - // TODO: institution parsing missing - - if (author != null) { - entry.setField(FieldName.AUTHOR, author); - } - if (editor != null) { - entry.setField(FieldName.EDITOR, editor); - } - if (abstractT != null) { - entry.setField(FieldName.ABSTRACT, abstractT); - } - if (!Strings.isNullOrEmpty(keywords)) { - entry.setField(FieldName.KEYWORDS, keywords); - } - if (title 
!= null) { - entry.setField(FieldName.TITLE, title); - } - if (conference != null) { - entry.setField(FieldName.BOOKTITLE, conference); - } - if (DOI != null) { - entry.setField(FieldName.DOI, DOI); - } - if (series != null) { - entry.setField(FieldName.SERIES, series); - } - if (volume != null) { - entry.setField(FieldName.VOLUME, volume); - } - if (number != null) { - entry.setField(FieldName.NUMBER, number); - } - if (pages != null) { - entry.setField(FieldName.PAGES, pages); - } - if (year != null) { - entry.setField(FieldName.YEAR, year); - } - if (publisher != null) { - entry.setField(FieldName.PUBLISHER, publisher); - } - - result.add(entry); - } catch (EncryptedPdfsNotSupportedException e) { - return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported.")); - } catch(IOException exception) { - return ParserResult.fromErrorMessage(exception.getLocalizedMessage()); - } catch (FetcherException e) { - return ParserResult.fromErrorMessage(e.getMessage()); + public ParserResult importDatabase(Path filePath, Charset encoding) throws IOException { + try (InputStream inputStream = new FileInputStream(filePath.toFile())) { + ContentExtractor extractor = new ContentExtractor(); + extractor.setPDF(inputStream); + DocumentMetadata documentMetadata = extractor.getMetadata(); + return convertMetadataToLibrary(documentMetadata); + } catch (AnalysisException | FileNotFoundException e) { + return ParserResult.fromError(e); } - - return new ParserResult(result); } - private String getFirstPageContents(PDDocument document) throws IOException { - PDFTextStripper stripper = new PDFTextStripper(); - - stripper.setStartPage(1); - stripper.setEndPage(1); - stripper.setSortByPosition(true); - stripper.setParagraphEnd(System.lineSeparator()); - StringWriter writer = new StringWriter(); - stripper.writeText(document, writer); - - return writer.toString(); - } - - /** - * Extract the year out of curString (if it is not yet defined) - */ - private void extractYear() 
{ - if (year != null) { - return; + private ParserResult convertMetadataToLibrary(DocumentMetadata documentMetadata) { + BibEntry entry = new BibEntry(BibLatexEntryTypes.ARTICLE); + if (documentMetadata.getAbstrakt() != null) { + entry.setField(FieldName.ABSTRACT, documentMetadata.getAbstrakt()); } - - Matcher m = YEAR_EXTRACT_PATTERN.matcher(curString); - if (m.find()) { - year = curString.substring(m.start(), m.end()); + if (documentMetadata.getFirstPage() != null && documentMetadata.getLastPage() != null) { + entry.setField(FieldName.PAGES, documentMetadata.getFirstPage() + "--" + documentMetadata.getLastPage()); } - - } - - /** - * PDFTextStripper normally does NOT produce multiple empty lines - * (besides at strange PDFs). These strange PDFs are handled here: - * proceed to next non-empty line - */ - private void proceedToNextNonEmptyLine() { - while ((i < lines.length) && "".equals(lines[i].trim())) { - i++; + if (documentMetadata.getIssue() != null) { + entry.setField(FieldName.ISSUE, documentMetadata.getIssue()); } - } - - /** - * Fill curString with lines until "" is found - * No trailing space is added - * i is advanced to the next non-empty line (ignoring white space) - *

- * Lines containing only white spaces are ignored, - * but NOT considered as "" - *

- * Uses GLOBAL variables lines, curLine, i - */ - private void fillCurStringWithNonEmptyLines() { - // ensure that curString does not end with " " - curString = curString.trim(); - while ((i < lines.length) && !"".equals(lines[i])) { - String curLine = lines[i].trim(); - if (!"".equals(curLine)) { - if (!curString.isEmpty()) { - // insert separating space if necessary - curString = curString.concat(" "); - } - curString = curString.concat(lines[i]); - } - i++; + if (documentMetadata.getJournal() != null) { + entry.setField(FieldName.JOURNAL, documentMetadata.getJournal()); } - - proceedToNextNonEmptyLine(); - } - - /** - * resets curString - * curString now contains the last block (until "" reached) - * Trailing space is added - *

- * invariant before/after: i points to line before the last handled block - */ - private void readLastBlock() { - while ((i >= 0) && "".equals(lines[i].trim())) { - i--; + if (documentMetadata.getJournalISSN() != null) { + entry.setField(FieldName.ISSN, documentMetadata.getJournalISSN()); + } + if (documentMetadata.getPublisher() != null) { + entry.setField(FieldName.PUBLISHER, documentMetadata.getPublisher()); + } + if (documentMetadata.getTitle() != null) { + entry.setField(FieldName.TITLE, documentMetadata.getTitle()); + } + if (documentMetadata.getVolume() != null) { + entry.setField(FieldName.VOLUME, documentMetadata.getVolume()); } - // i is now at the end of a block + if (documentMetadata.getId(DocumentMetadata.ID_DOI) != null) { + entry.setField(FieldName.DOI, documentMetadata.getId(DocumentMetadata.ID_DOI)); + } + if (documentMetadata.getDate(DocumentDate.DATE_PUBLISHED) != null) { + entry.setField(FieldName.DATE, convertDateToString(documentMetadata.getDate(DocumentDate.DATE_PUBLISHED))); + } + entry.setField(FieldName.EDITOR, convertPersonNamesToString(documentMetadata.getEditors())); + entry.setField(FieldName.AUTHOR, convertPersonNamesToString(documentMetadata.getAuthors())); + entry.setField(FieldName.KEYWORDS, convertKeywordsToString(documentMetadata.getKeywords())); + // The following fields provided by CERMINE are ignored since we have no proper BibTeX equivalent + //entry.setField(FieldName, documentMetadata.getId(DocumentMetadata.ID_URN)); + //entry.setField(FieldName, documentMetadata.getId(DocumentMetadata.ID_HINDAWI)); + //entry.setField(FieldName, documentMetadata.getDate(DocumentDate.DATE_ACCEPTED)); + //entry.setField(FieldName, documentMetadata.getDate(DocumentDate.DATE_RECEIVED)); + //entry.setField(FieldName, documentMetadata.getDate(DocumentDate.DATE_REVISED)); + //entry.setField(FieldName, documentMetadata.getAffiliations()); + return ParserResult.fromEntry(entry); + } - int end = i; + private String convertDateToString(DocumentDate 
date) { + return Date.parse(date.getDay(), date.getMonth(), date.getYear()).map(Date::getNormalized).orElse(""); + } - // find beginning - while ((i >= 0) && !"".equals(lines[i])) { - i--; - } - // i is now the line before the beginning of the block - // this fulfills the invariant + private String convertKeywordsToString(List<String> keywords) { + return keywords.stream().collect(Collectors.joining(importFormatPreferences.getKeywordSeparator() + " ")); + } - curString = ""; - for (int j = i + 1; j <= end; j++) { - curString = curString.concat(lines[j].trim()); - if (j != end) { - curString = curString.concat(" "); - } - } + private String convertPersonNamesToString(List<DocumentAuthor> persons) { + return persons.stream() + .map(DocumentAuthor::getName) + .collect(Collectors.joining(" and ")); } @Override @@ -597,7 +136,6 @@ public FileExtensions getExtensions() { @Override public String getDescription() { - return "PdfContentImporter parses data of the first page of the PDF and creates a BibTeX entry. Currently, Springer and IEEE formats are supported."; + return "Parses the PDF and extracts metadata."; } - } diff --git a/src/main/java/net/sf/jabref/model/entry/BibEntry.java b/src/main/java/net/sf/jabref/model/entry/BibEntry.java index f1e55b73751..809798f2b2d 100644 --- a/src/main/java/net/sf/jabref/model/entry/BibEntry.java +++ b/src/main/java/net/sf/jabref/model/entry/BibEntry.java @@ -36,46 +36,35 @@ import org.apache.commons.logging.LogFactory; public class BibEntry implements Cloneable { - private static final Log LOGGER = LogFactory.getLog(BibEntry.class); - public static final String TYPE_HEADER = "entrytype"; public static final String OBSOLETE_TYPE_HEADER = "bibtextype"; public static final String KEY_FIELD = "bibtexkey"; - protected static final String ID_FIELD = "id"; public static final String DEFAULT_TYPE = "misc"; - + protected static final String ID_FIELD = "id"; + private static final Log LOGGER = LogFactory.getLog(BibEntry.class); private static final Pattern 
REMOVE_TRAILING_WHITESPACE = Pattern.compile("\\s+$"); - - private String id; - private final SharedBibEntryData sharedBibEntryData; - - private String type; - private Map fields = new ConcurrentHashMap<>(); - /** * Map to store the words in every field */ private final Map> fieldsAsWords = new HashMap<>(); - /** * Cache that stores latex free versions of fields. */ private final Map latexFreeFields = new ConcurrentHashMap<>(); - + private final EventBus eventBus = new EventBus(); + private String id; + private String type; + private Map fields = new ConcurrentHashMap<>(); /** * Used to cleanse field values for internal LaTeX-free storage */ private LatexToUnicode unicodeConverter = new LatexToUnicode(); - // Search and grouping status is stored in boolean fields for quick reference: private boolean searchHit; private boolean groupHit; - private String parsedSerialization; - private String commentsBeforeEntry = ""; - /** * Marks whether the complete serialization, which was read from file, should be used. * @@ -83,8 +72,6 @@ public class BibEntry implements Cloneable { */ private boolean changed; - private final EventBus eventBus = new EventBus(); - /** * Constructs a new BibEntry. The internal ID is set to IdGenerator.next() */ @@ -102,6 +89,15 @@ public BibEntry(String type) { this(IdGenerator.next(), type); } + /** + * Constructs a new BibEntry with the given type + * + * @param type The type to set. + */ + public BibEntry(EntryType type) { + this(type.getName()); + } + /** * Constructs a new BibEntry with the given ID and given type * @@ -165,6 +161,13 @@ public Optional getResolvedFieldOrAlias(String field, BibDatabase databa return result.map(resultText -> BibDatabase.getText(resultText, database)); } + /** + * Returns this entry's ID. + */ + public String getId() { + return id; + } + /** * Sets this entry's ID, provided the database containing it * doesn't veto the change. 
@@ -182,15 +185,17 @@ public void setId(String id) { } /** - * Returns this entry's ID. + * Returns the cite key AKA citation key AKA BibTeX key, or null if it is not set. + * Note: this is not the internal Id of this entry. The internal Id is always present, whereas the BibTeX key might not be present. */ - public String getId() { - return id; + @Deprecated + public String getCiteKey() { + return fields.get(KEY_FIELD); } /** - * Sets the cite key AKA citation key AKA BibTeX key. - * Note: This is not the internal Id of this entry. The internal Id is always present, whereas the BibTeX key might not be present. + * Sets the cite key AKA citation key AKA BibTeX key. Note: This is not the internal Id of this entry. + * The internal Id is always present, whereas the BibTeX key might not be present. * * @param newCiteKey The cite key to set. Must not be null; use {@link #clearCiteKey()} to remove the cite key. */ @@ -198,15 +203,6 @@ public void setCiteKey(String newCiteKey) { setField(KEY_FIELD, newCiteKey); } - /** - * Returns the cite key AKA citation key AKA BibTeX key, or null if it is not set. - * Note: this is not the internal Id of this entry. The internal Id is always present, whereas the BibTeX key might not be present. - */ - @Deprecated - public String getCiteKey() { - return fields.get(KEY_FIELD); - } - public Optional getCiteKeyOptional() { return Optional.ofNullable(fields.get(KEY_FIELD)); } @@ -222,6 +218,13 @@ public String getType() { return type; } + /** + * Sets this entry's type. + */ + public void setType(EntryType type) { + this.setType(type.getName()); + } + /** * Sets this entry's type. */ @@ -249,13 +252,6 @@ public void setType(String type) { setType(type, EntryEventSource.LOCAL); } - /** - * Sets this entry's type. - */ - public void setType(EntryType type) { - this.setType(type.getName()); - } - /** * Returns an set containing the names of all fields that are * set for this particular entry. 
@@ -381,10 +377,6 @@ public Date parse(String source, ParsePosition pos) { return Optional.empty(); } - private interface GetFieldInterface { - Optional getValueForField(String fieldName); - } - /** * Return the LaTeX-free contents of the given field or its alias an an Optional * @@ -646,15 +638,15 @@ public Optional getPublicationDate() { return year; } + public String getParsedSerialization() { + return parsedSerialization; + } + public void setParsedSerialization(String parsedSerialization) { changed = false; this.parsedSerialization = parsedSerialization; } - public String getParsedSerialization() { - return parsedSerialization; - } - public void setCommentsBeforeEntry(String parsedComments) { this.commentsBeforeEntry = parsedComments; } @@ -838,4 +830,8 @@ public Optional setFiles(List files) { return this.setField(FieldName.FILE, newValue); } + private interface GetFieldInterface { + Optional getValueForField(String fieldName); + } + } diff --git a/src/main/java/net/sf/jabref/model/entry/Date.java b/src/main/java/net/sf/jabref/model/entry/Date.java new file mode 100644 index 00000000000..b634256aeda --- /dev/null +++ b/src/main/java/net/sf/jabref/model/entry/Date.java @@ -0,0 +1,62 @@ +package net.sf.jabref.model.entry; + +import java.time.DateTimeException; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeParseException; +import java.time.temporal.TemporalAccessor; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; + +public class Date { + + private final TemporalAccessor date; + + public Date(int day, MonthUtil.Month month, int year) { + this.date = LocalDate.of(year, month.number, day); + } + + public Date(TemporalAccessor date) { + this.date = date; + } + + public static Optional parse(String day, String month, String year) { + try { + int dayParsed = Integer.parseInt(day); + MonthUtil.Month monthParsed = MonthUtil.getMonth(month); + int yearParsed = 
Integer.parseInt(year); + return Optional.of(new Date(dayParsed, monthParsed, yearParsed)); + } catch (NumberFormatException | DateTimeException exception) { + return Optional.empty(); + } + } + + /** + * Try to parse the following formats "M/y" (covers 9/15, 9/2015, and 09/2015) "MMMM (dd), yyyy" (covers September + * 1, 2015 and September, 2015) "yyyy-MM-dd" (covers 2009-1-15) "d.M.uuuu" (covers 15.1.2015) "uuuu.M.d" (covers + * 2015.1.15) The code is essentially taken from http://stackoverflow.com/questions/4024544/how-to-parse-dates-in-multiple-formats-using-simpledateformat. + */ + public static Optional parse(String dateString) { + List formatStrings = + Arrays.asList("uuuu-M-d", "uuuu-M", "M/uu", "M/uuuu", "MMMM d, uuuu", "MMMM, uuuu", "d.M.uuuu", "uuuu.M.d"); + for (String formatString : formatStrings) { + try { + TemporalAccessor parsed = DateTimeFormatter.ofPattern(formatString).parse(dateString); + return Optional.of(new Date(parsed)); + } catch (DateTimeParseException ignored) { + // Ignored + } + } + + return Optional.empty(); + } + + /** + * Formats the date to a string of the form yyyy-mm-dd or yyyy-mm. 
+ */ + public String getNormalized() { + DateTimeFormatter dateFormatter = DateTimeFormatter.ofPattern("uuuu-MM[-dd]"); + return dateFormatter.format(date); + } +} diff --git a/src/main/java/net/sf/jabref/model/entry/MonthUtil.java b/src/main/java/net/sf/jabref/model/entry/MonthUtil.java index 225c830f07f..7111178aa31 100644 --- a/src/main/java/net/sf/jabref/model/entry/MonthUtil.java +++ b/src/main/java/net/sf/jabref/model/entry/MonthUtil.java @@ -26,44 +26,6 @@ public class MonthUtil { new Month("December", "dec", "12", "#dec#", 12, 11) ); - - public static class Month { - - public final String fullName; - public final String shortName; - public final String twoDigitNumber; - public final String bibtexFormat; - public final int number; - public final int index; - - - public Month(String fullName, String shortName, String twoDigitNumber, String bibtexFormat, int number, int index) { - this.fullName = fullName; - this.shortName = shortName; - this.twoDigitNumber = twoDigitNumber; - this.bibtexFormat = bibtexFormat; - this.number = number; - this.index = index; - } - - public boolean isValid() { - return true; - } - } - - private static class UnknownMonth extends Month { - - public UnknownMonth() { - super(null, null, null, null, 0, -1); - } - - @Override - public boolean isValid() { - return false; - } - } - - /** * Find month by number * @@ -136,4 +98,42 @@ public static Month getMonth(String value) { } } + public static class Month { + + public final String fullName; + public final String shortName; + public final String twoDigitNumber; + public final String bibtexFormat; + /** + * In the range 1 - 12 + */ + public final int number; + public final int index; + + + public Month(String fullName, String shortName, String twoDigitNumber, String bibtexFormat, int number, int index) { + this.fullName = fullName; + this.shortName = shortName; + this.twoDigitNumber = twoDigitNumber; + this.bibtexFormat = bibtexFormat; + this.number = number; + this.index = index; + } + + 
public boolean isValid() { + return true; + } + } + + private static class UnknownMonth extends Month { + + public UnknownMonth() { + super(null, null, null, null, 0, -1); + } + + @Override + public boolean isValid() { + return false; + } + } } diff --git a/src/main/java/net/sf/jabref/pdfimport/PdfImporter.java b/src/main/java/net/sf/jabref/pdfimport/PdfImporter.java index c75de6303bb..71ff2617f11 100644 --- a/src/main/java/net/sf/jabref/pdfimport/PdfImporter.java +++ b/src/main/java/net/sf/jabref/pdfimport/PdfImporter.java @@ -1,6 +1,7 @@ package net.sf.jabref.pdfimport; import java.io.File; +import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; @@ -42,13 +43,12 @@ public class PdfImporter { + private static final Log LOGGER = LogFactory.getLog(PdfImporter.class); private final JabRefFrame frame; private final BasePanel panel; private final MainTable entryTable; private final int dropRow; - private static final Log LOGGER = LogFactory.getLog(PdfImporter.class); - /** * Creates the PdfImporter * @@ -64,30 +64,6 @@ public PdfImporter(JabRefFrame frame, BasePanel panel, MainTable entryTable, int this.dropRow = dropRow; } - - public class ImportPdfFilesResult { - - private final List noPdfFiles; - private final List entries; - - - public ImportPdfFilesResult(List noPdfFiles, List entries) { - this.noPdfFiles = noPdfFiles; - this.entries = entries; - } - - - public List getNoPdfFiles() { - return noPdfFiles; - } - - - public List getEntries() { - return entries; - } - } - - /** * * Imports the PDF files given by fileNames @@ -218,7 +194,13 @@ private void doContentImport(String fileName, List res) { PdfContentImporter contentImporter = new PdfContentImporter( Globals.prefs.getImportFormatPreferences()); Path filePath = Paths.get(fileName); - ParserResult result = contentImporter.importDatabase(filePath, Globals.prefs.getDefaultEncoding()); + ParserResult result; + try { + result = 
contentImporter.importDatabase(filePath, Globals.prefs.getDefaultEncoding()); + } catch (IOException e) { + frame.showMessage(e.getLocalizedMessage()); + return; + } if (result.hasWarnings()) { frame.showMessage(result.getErrorMessage()); } @@ -291,4 +273,26 @@ private Optional createNewEntry() { } return Optional.empty(); } + + public class ImportPdfFilesResult { + + private final List noPdfFiles; + private final List entries; + + + public ImportPdfFilesResult(List noPdfFiles, List entries) { + this.noPdfFiles = noPdfFiles; + this.entries = entries; + } + + + public List getNoPdfFiles() { + return noPdfFiles; + } + + + public List getEntries() { + return entries; + } + } } diff --git a/src/test/java/net/sf/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/net/sf/jabref/logic/importer/fileformat/PdfContentImporterTest.java index c5e89799471..70bde0b4871 100644 --- a/src/test/java/net/sf/jabref/logic/importer/fileformat/PdfContentImporterTest.java +++ b/src/test/java/net/sf/jabref/logic/importer/fileformat/PdfContentImporterTest.java @@ -1,15 +1,14 @@ package net.sf.jabref.logic.importer.fileformat; -import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Collections; import java.util.List; +import net.sf.jabref.logic.importer.ImportFormatPreferences; import net.sf.jabref.logic.util.FileExtensions; import net.sf.jabref.model.entry.BibEntry; -import net.sf.jabref.preferences.JabRefPreferences; import org.junit.Before; import org.junit.Test; @@ -20,10 +19,9 @@ public class PdfContentImporterTest { private PdfContentImporter importer; - @Before public void setUp() { - importer = new PdfContentImporter(JabRefPreferences.getInstance().getImportFormatPreferences()); + importer = new PdfContentImporter(new ImportFormatPreferences()); } @Test @@ -32,14 +30,7 @@ public void testsGetExtensions() { } @Test - public void testGetDescription() { - assertEquals( 
- "PdfContentImporter parses data of the first page of the PDF and creates a BibTeX entry. Currently, Springer and IEEE formats are supported.", - importer.getDescription()); - } - - @Test - public void doesNotHandleEncryptedPdfs() throws URISyntaxException { + public void doesNotHandleEncryptedPdfs() throws Exception { Path file = Paths.get(PdfContentImporter.class.getResource("/pdfs/encrypted.pdf").toURI()); List result = importer.importDatabase(file, StandardCharsets.UTF_8).getDatabase().getEntries(); assertEquals(Collections.emptyList(), result); diff --git a/src/test/java/net/sf/jabref/logic/importer/fileformat/PdfContentImporterTestFiles.java b/src/test/java/net/sf/jabref/logic/importer/fileformat/PdfContentImporterTestFiles.java index ab695d2df51..3d921335dc6 100644 --- a/src/test/java/net/sf/jabref/logic/importer/fileformat/PdfContentImporterTestFiles.java +++ b/src/test/java/net/sf/jabref/logic/importer/fileformat/PdfContentImporterTestFiles.java @@ -7,11 +7,12 @@ import java.nio.file.Paths; import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.List; -import net.sf.jabref.logic.bibtex.BibEntryAssert; +import net.sf.jabref.logic.importer.ImportFormatPreferences; import net.sf.jabref.model.entry.BibEntry; -import net.sf.jabref.preferences.JabRefPreferences; +import net.sf.jabref.model.entry.BibLatexEntryTypes; import org.junit.Test; import org.junit.runner.RunWith; @@ -19,35 +20,45 @@ import org.junit.runners.Parameterized.Parameter; import org.junit.runners.Parameterized.Parameters; +import static org.junit.Assert.assertEquals; + @RunWith(Parameterized.class) public class PdfContentImporterTestFiles { - @Parameter - public String fileName; - + @Parameter public String pdfFileName; + @Parameter(value = 1) public BibEntry expectedEntry; @Parameters(name = "{index}: {0}") public static Collection fileNames() { - // The test folder contains pairs of PDFs and BibTeX files. We check each pair. 
- // This method returns the basenames of the available pairs - Object[][] data = new Object[][] { // minimal PDF, not encrypted - {"LNCS-minimal"}, + {"LNCS-minimal.pdf", + new BibEntry(BibLatexEntryTypes.ARTICLE) + .withField("abstract", "Abstract goes here Simple Figure Simple Table Figure 1. Simple Figure Table 1. Simple Table") // expected: Abstract goes here + .withField("title", "Firstname Lastname and Firstname Lastname") // expected: Paper Title and the return as author + }, // minimal PDF, write-protected, thus encrypted - {"LNCS-minimal-protected"}}; + {"LNCS-minimal-protected.pdf", + new BibEntry(BibLatexEntryTypes.ARTICLE) + .withField("abstract", "Abstract goes here Simple Figure Simple Table Figure 1. Simple Figure Table 1. Simple Table") // expected: Abstract goes here + .withField("title", "Firstname Lastname and Firstname Lastname") // expected: Paper Title and the return as author + }, + {"1405.2249v1.pdf", + new BibEntry(BibLatexEntryTypes.ARTICLE) // expected: thesis + .withField("author", "Master's Thesis and Presented by Tobias Diez and Assessors: Dr. G. Rudolph Dr. R. 
Verch") // expected: Tobias Diez + .withField("pages", "86127") // expected: 1 -- 127 + .withField("title", "Slice theorem for Fréchet group actions and covariant symplectic field theory") + } + }; return Arrays.asList(data); } @Test public void correctContent() throws IOException, URISyntaxException { - String pdfFileName = fileName + ".pdf"; - String bibFileName = fileName + ".bib"; - PdfContentImporter importer = new PdfContentImporter( - JabRefPreferences.getInstance().getImportFormatPreferences()); + PdfContentImporter importer = new PdfContentImporter(new ImportFormatPreferences()); Path pdfFile = Paths.get(PdfContentImporter.class.getResource(pdfFileName).toURI()); List result = importer.importDatabase(pdfFile, StandardCharsets.UTF_8).getDatabase().getEntries(); - BibEntryAssert.assertEquals(PdfContentImporterTest.class, bibFileName, result); + assertEquals(Collections.singletonList(expectedEntry), result); } } diff --git a/src/test/resources/net/sf/jabref/logic/importer/fileformat/1405.2249v1.pdf b/src/test/resources/net/sf/jabref/logic/importer/fileformat/1405.2249v1.pdf new file mode 100644 index 00000000000..b516ddb4bcc Binary files /dev/null and b/src/test/resources/net/sf/jabref/logic/importer/fileformat/1405.2249v1.pdf differ