-
-
Notifications
You must be signed in to change notification settings - Fork 2.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fetcher for IACR eprints #3473
Fetcher for IACR eprints #3473
Changes from 10 commits
e23fee2
096d857
20ad210
83ac917
95e431c
b76a3f7
1076c1d
89e237b
fad3203
025eb88
4054e16
53be6f7
58034e5
4fa1534
8a091b0
fee6ccf
2ac6451
7f05741
ff6bfa7
f8b671d
108b8a9
4ebb7db
031aa87
116de21
079ba27
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
package org.jabref.logic.importer.fetcher; | ||
|
||
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.time.format.DateTimeParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
|
||
import org.jabref.logic.importer.FetcherException; | ||
import org.jabref.logic.importer.IdBasedFetcher; | ||
import org.jabref.logic.importer.ImportFormatPreferences; | ||
import org.jabref.logic.importer.ParseException; | ||
import org.jabref.logic.importer.fileformat.BibtexParser; | ||
import org.jabref.logic.l10n.Localization; | ||
import org.jabref.logic.net.URLDownload; | ||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.entry.FieldName; | ||
|
||
import org.apache.commons.lang3.StringUtils; | ||
import org.apache.commons.logging.Log; | ||
import org.apache.commons.logging.LogFactory; | ||
|
||
public class IacrEprintFetcher implements IdBasedFetcher { | ||
|
||
public static final String NAME = "IACR eprints"; | ||
|
||
private static final Log LOGGER = LogFactory.getLog(IacrEprintFetcher.class); | ||
private static final Pattern DATE_FROM_WEBSITE_PATTERN = Pattern.compile("[a-z ]+(\\d{1,2} [A-Za-z][a-z]{2} \\d{4})"); | ||
private static final DateFormat DATE_FORMAT_WEBSITE = new SimpleDateFormat("dd MMM yyyy"); | ||
private static final DateFormat DATE_FORMAT_BIBTEX = new SimpleDateFormat("yyyy-MM-dd"); | ||
private static final Predicate<String> IDENTIFIER_PREDICATE = Pattern.compile("\\d{4}/\\d{3,5}").asPredicate(); | ||
private static final String CITATION_URL_PREFIX = "https://eprint.iacr.org/eprint-bin/cite.pl?entry="; | ||
private static final String DESCRIPTION_URL_PREFIX = "https://eprint.iacr.org/"; | ||
private static final Charset WEBSITE_CHARSET = Charset.forName("iso-8859-1"); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Very minor: you can directly use the predefined constant `StandardCharsets.ISO_8859_1` instead of looking the charset up by name. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for pointing that out - I was looking for something like that, but apparently didn't look long enough... There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In other cases where Java provides predefined constants, the holder types all start with StandardXXX; for example, for file opening there is StandardOpenOption. It is not really obvious if you search for it ;) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yep, I expected them in something like Charsets or directly as constants in the Charset class... |
||
|
||
/** Preferences handed to the BibTeX parser when the downloaded citation is parsed. */
private final ImportFormatPreferences prefs;

/**
 * Creates a fetcher for IACR eprints.
 *
 * @param prefs import preferences passed on to {@code BibtexParser.singleFromString}
 */
public IacrEprintFetcher(ImportFormatPreferences prefs) {
    this.prefs = prefs;
}
|
||
@Override | ||
public Optional<BibEntry> performSearchById(String identifier) throws FetcherException { | ||
String identifierWithoutLettersAndSpaces = identifier.replaceAll("[^0-9/]", " ").trim(); | ||
|
||
if (!IDENTIFIER_PREDICATE.test(identifierWithoutLettersAndSpaces)) { | ||
throw new FetcherException(Localization.lang("Invalid IACR identifier: '%0'.", identifier)); | ||
} | ||
|
||
Optional<BibEntry> entry = createEntryFromIacrCitation(identifierWithoutLettersAndSpaces); | ||
|
||
if (entry.isPresent()) { | ||
setAdditionalFields(entry.get(), identifierWithoutLettersAndSpaces); | ||
} | ||
|
||
return entry; | ||
} | ||
|
||
private Optional<BibEntry> createEntryFromIacrCitation(String validIdentifier) throws FetcherException { | ||
String bibtexCitationHtml = getHtml(CITATION_URL_PREFIX + validIdentifier); | ||
String actualEntry = getValueBetween("<PRE>", "</PRE>", bibtexCitationHtml); | ||
|
||
Optional<BibEntry> entry; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure, but you probably need to initialize it with Optional.empty() or you could get still an NPE if no entry is found There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the javadoc on BibtexParser.singleFromString is correct, it should always return an entry or an Optional.empty(). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can also just return the entry directly in the try construct. This is in my opinion the most readable solution. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
try { | ||
entry = BibtexParser.singleFromString(actualEntry, prefs); | ||
} catch (ParseException e) { | ||
throw new FetcherException(Localization.lang("Entry from IACR could not be parsed."), e); | ||
} | ||
return entry; | ||
} | ||
|
||
private void setAdditionalFields(BibEntry entry, String identifier) throws FetcherException { | ||
String descriptiveHtml = getHtml(DESCRIPTION_URL_PREFIX + identifier); | ||
String version = getVersion(identifier, descriptiveHtml); | ||
|
||
entry.setField(FieldName.VERSION, version); | ||
entry.setField(FieldName.URL, DESCRIPTION_URL_PREFIX + identifier + "/" + version); | ||
entry.setField(FieldName.ABSTRACT, getAbstract(descriptiveHtml)); | ||
|
||
String dateStringAsInHtml = getValueBetween("<b>Date: </b>", "<p />", descriptiveHtml); | ||
entry.setField(FieldName.DATE, getLatestDate(dateStringAsInHtml)); | ||
} | ||
|
||
private String getVersion(String identifier, String descriptiveHtml) throws FetcherException { | ||
String startOfVersionString = "<b>Version: </b><a href=\"/" + identifier + "/"; | ||
String version = getValueBetween(startOfVersionString, "\"", descriptiveHtml); | ||
return version; | ||
} | ||
|
||
private String getAbstract(String descriptiveHtml) throws FetcherException { | ||
String abstractText = getValueBetween("<b>Abstract: </b>", "<p />", descriptiveHtml); | ||
// for some reason, all spaces are doubled... | ||
abstractText = abstractText.replaceAll("\\s(\\s)", "$1"); | ||
return abstractText; | ||
} | ||
|
||
private String getLatestDate(String dateStringAsInHtml) throws FetcherException { | ||
String[] rawDates = dateStringAsInHtml.split(","); | ||
List<String> formattedDates = new ArrayList<>(); | ||
for (String rawDate : rawDates) { | ||
Date date = parseDateFromWebsite(rawDate); | ||
if (date != null) { | ||
formattedDates.add(DATE_FORMAT_BIBTEX.format(date)); | ||
} | ||
} | ||
|
||
if (formattedDates.isEmpty()) { | ||
throw new FetcherException(Localization.lang("Entry from IACR could not be parsed.")); | ||
} | ||
|
||
Collections.sort(formattedDates, Collections.reverseOrder()); | ||
return formattedDates.get(0); | ||
} | ||
|
||
private Date parseDateFromWebsite(String dateStringFromWebsite) { | ||
Date date = null; | ||
Matcher dateMatcher = DATE_FROM_WEBSITE_PATTERN.matcher(dateStringFromWebsite.trim()); | ||
if (dateMatcher.find()) { | ||
try { | ||
date = DATE_FORMAT_WEBSITE.parse(dateMatcher.group(1)); | ||
} catch (java.text.ParseException e) { | ||
LOGGER.warn("Date from IACR could not be parsed", e); | ||
} | ||
} | ||
return date; | ||
} | ||
|
||
private String getHtml(String url) throws FetcherException { | ||
try { | ||
URLDownload download = new URLDownload(url); | ||
return download.asString(WEBSITE_CHARSET); | ||
} catch (IOException e) { | ||
throw new FetcherException(Localization.lang("Could not retrieve entry data from IACR at '%0'.", url), e); | ||
} | ||
} | ||
|
||
private String getValueBetween(String from, String to, String haystack) throws FetcherException { | ||
String value = StringUtils.substringBetween(haystack, from, to); | ||
if (value == null) { | ||
throw new FetcherException(Localization.lang("Could not extract required data from IACR HTML.")); | ||
} else { | ||
return value; | ||
} | ||
} | ||
|
||
@Override | ||
public String getName() { | ||
return NAME; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1819,6 +1819,10 @@ Copy_BibTeX_key_and_title= | |
File_rename_failed_for_%0_entries.= | ||
Merged_BibTeX_source_code= | ||
Invalid_DOI\:_'%0'.=Ugyldig_DOI\:_'%0'. | ||
Invalid_IACR_identifier\:_'%0'.= | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please try to use a more generic version in these strings. As of now, they are not reusable in other fetchers or situations. E.g. just use |
||
Could_not_extract_required_data_from_IACR_HTML.= | ||
Could_not_retrieve_entry_data_from_IACR_at_'%0'.= | ||
Entry_from_IACR_could_not_be_parsed.= | ||
should_start_with_a_name= | ||
should_end_with_a_name= | ||
unexpected_closing_curly_bracket= | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SimpleDateFormat is outdated; since Java 8 it has been superseded by the java.time API (for example, DateTimeFormatter together with LocalDate). For an introduction, see:
http://www.baeldung.com/java-8-date-time-intro
Example 18 on this page also gives an idea of how to do it: http://javarevisited.blogspot.de/2015/03/20-examples-of-date-and-time-api-from-Java8.html
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Didn't know that - I'm working on changing it.